Merge branch 'akpm-current/current'

author: Stephen Rothwell <sfr@canb.auug.org.au> 2016-12-20 11:54:45 +1100
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2016-12-20 11:54:45 +1100
commit: 09fc92cf819f627c5c6c2fe4b8921950d9f81275 (patch)
tree: 9d43e3644f926bc169ab2a46945d6d61a8263247
parent: b8317fd612ceb1d118275e92f66ded6fdf0a2023 (diff)
parent: e1f8c36b6eb3b4b793064ecd4f440c02e7fdbe00 (diff)
57 files changed, 1913 insertions, 293 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index be2d6d0a03a4..21e2d8863705 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1441,6 +1441,10 @@
 			The builtin appraise policy appraises all files
 			owned by uid=0.
 
+	ima_canonical_fmt [IMA]
+			Use the canonical format for the binary runtime
+			measurements, instead of host native format.
+
 	ima_hash=	[IMA]
 			Format: { md5 | sha1 | rmd160 | sha256 | sha384
 				   | sha512 | ... }
diff --git a/arch/Kconfig b/arch/Kconfig
index 19483aea4bbc..99839c23d453 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -5,6 +5,9 @@
 config KEXEC_CORE
 	bool
 
+config HAVE_IMA_KEXEC
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0ec44d6..f98baaec0a15 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -17,6 +17,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/personality.h>	/* For READ_IMPLIES_EXEC */
+
 #ifndef CONFIG_MMU
 
 #include <asm/page-nommu.h>
diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h
new file mode 100644
index 000000000000..e7b59b930ce0
--- /dev/null
+++ b/arch/arm64/include/asm/setup.h
@@ -0,0 +1,19 @@
+/*
+ * arch/arm64/include/asm/setup.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_SETUP_H
+#define __ASM_SETUP_H
+
+#include <uapi/asm/setup.h>
+
+static inline unsigned long kaslr_offset(void)
+{
+	return kimage_vaddr - KIMAGE_VADDR;
+}
+
+#endif
diff --git a/arch/arm64/include/uapi/asm/setup.h b/arch/arm64/include/uapi/asm/setup.h
index 9cf2e46fbbdf..26631c8f0abc 100644
--- a/arch/arm64/include/uapi/asm/setup.h
+++ b/arch/arm64/include/uapi/asm/setup.h
@@ -16,8 +16,8 @@
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef __ASM_SETUP_H
-#define __ASM_SETUP_H
+#ifndef _UAPI__ASM_SETUP_H
+#define _UAPI__ASM_SETUP_H
 
 #include <linux/types.h>
 
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a53f52ac81c6..b051367e2149 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -338,11 +338,11 @@ subsys_initcall(topology_init);
 static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
 			      void *p)
 {
-	u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+	const unsigned long offset = kaslr_offset();
 
-	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
-		pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
-			 kaslr_offset, KIMAGE_VADDR);
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) {
+		pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+			 offset, KIMAGE_VADDR);
 	} else {
 		pr_emerg("Kernel Offset: disabled\n");
 	}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3da87e198878..a8ee573fe610 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -469,6 +469,7 @@ config KEXEC
 config KEXEC_FILE
 	bool "kexec file based system call"
 	select KEXEC_CORE
+	select HAVE_IMA_KEXEC
 	select BUILD_BIN2C
 	depends on PPC64
 	depends on CRYPTO=y
diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
new file mode 100644
index 000000000000..2313bdface34
--- /dev/null
+++ b/arch/powerpc/include/asm/ima.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_POWERPC_IMA_H
+#define _ASM_POWERPC_IMA_H
+
+struct kimage;
+
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
+
+#ifdef CONFIG_IMA
+void remove_ima_buffer(void *fdt, int chosen_node);
+#else
+static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+			      size_t size);
+
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
+#else
+static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
+				   int chosen_node)
+{
+	remove_ima_buffer(fdt, chosen_node);
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+
+#endif /* _ASM_POWERPC_IMA_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 6c3b71502fbc..25668bc8cb2a 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -94,11 +94,22 @@ static inline bool kdump_in_progress(void)
 #ifdef CONFIG_KEXEC_FILE
 extern struct kexec_file_ops kexec_elf64_ops;
 
+#ifdef CONFIG_IMA_KEXEC
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+	phys_addr_t ima_buffer_addr;
+	size_t ima_buffer_size;
+};
+#endif
+
 int setup_purgatory(struct kimage *image, const void *slave_code,
 		    const void *fdt, unsigned long kernel_load_addr,
 		    unsigned long fdt_load_addr);
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
-		  unsigned long initrd_len, const char *cmdline);
+int setup_new_fdt(const struct kimage *image, void *fdt,
+		  unsigned long initrd_load_addr, unsigned long initrd_len,
+		  const char *cmdline);
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
 #endif /* CONFIG_KEXEC_FILE */
 
 #else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a3a6047fd395..23f8082d7bfa 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -112,6 +112,10 @@ obj-$(CONFIG_PCI_MSI)		+= msi.o
 obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec.o crash.o \
 				   machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
+ifeq ($(CONFIG_HAVE_IMA_KEXEC)$(CONFIG_IMA),yy)
+obj-y				+= ima_kexec.o
+endif
+
 obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c
new file mode 100644
index 000000000000..5ea42c937ca9
--- /dev/null
+++ b/arch/powerpc/kernel/ima_kexec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+	struct device_node *root;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return -EINVAL;
+
+	*addr_cells = of_n_addr_cells(root);
+	*size_cells = of_n_size_cells(root);
+
+	of_node_put(root);
+
+	return 0;
+}
+
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+			       size_t *size)
+{
+	int ret, addr_cells, size_cells;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	if (len < 4 * (addr_cells + size_cells))
+		return -ENOENT;
+
+	*addr = of_read_number(prop, addr_cells);
+	*size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+	return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:	On successful return, set to point to the buffer contents.
+ * @size:	On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+	int ret, len;
+	unsigned long tmp_addr;
+	size_t tmp_size;
+	const void *prop;
+
+	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+	if (ret)
+		return ret;
+
+	*addr = __va(tmp_addr);
+	*size = tmp_size;
+
+	return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+	int ret;
+	unsigned long addr;
+	size_t size;
+	struct property *prop;
+
+	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+	if (ret)
+		return ret;
+
+	ret = of_remove_property(of_chosen, prop);
+	if (ret)
+		return ret;
+
+	return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+void remove_ima_buffer(void *fdt, int chosen_node)
+{
+	int ret, len;
+	unsigned long addr;
+	size_t size;
+	const void *prop;
+
+	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return;
+
+	ret = do_get_kexec_buffer(prop, len, &addr, &size);
+	fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+	if (ret)
+		return;
+
+	ret = delete_fdt_mem_rsv(fdt, addr, size);
+	if (!ret)
+		pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
+ *
+ * Architectures should use this function to pass on the IMA buffer
+ * information to the next kernel.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+			      size_t size)
+{
+	image->arch.ima_buffer_addr = load_addr;
+	image->arch.ima_buffer_size = size;
+
+	return 0;
+}
+
+static int write_number(void *p, u64 value, int cells)
+{
+	if (cells == 1) {
+		u32 tmp;
+
+		if (value > U32_MAX)
+			return -EINVAL;
+
+		tmp = cpu_to_be32(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else if (cells == 2) {
+		u64 tmp;
+
+		tmp = cpu_to_be64(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image:		kexec image being loaded.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @chosen_node:	Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
+{
+	int ret, addr_cells, size_cells, entry_size;
+	u8 value[16];
+
+	remove_ima_buffer(fdt, chosen_node);
+	if (!image->arch.ima_buffer_size)
+		return 0;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	entry_size = 4 * (addr_cells + size_cells);
+
+	if (entry_size > sizeof(value))
+		return -EINVAL;
+
+	ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+	if (ret)
+		return ret;
+
+	ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+			   size_cells);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+			  entry_size);
+	if (ret < 0)
+		return -EINVAL;
+
+	ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
+			      image->arch.ima_buffer_size);
+	if (ret)
+		return -EINVAL;
+
+	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+		 image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
index 6acffd34a70f..9a42309b091a 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -627,7 +627,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 		goto out;
 	}
 
-	ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline);
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
 	if (ret)
 		goto out;
 
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 7abc8a75ee48..992c0d258e5d 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -27,6 +27,7 @@
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <asm/ima.h>
 
 #define SLAVE_CODE_SIZE		256
 
@@ -180,7 +181,7 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
  *
  * Return: 0 on success, or negative errno on error.
  */
-static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
 {
 	int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
 
@@ -209,6 +210,7 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
 
 /*
  * setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @image:		kexec image being loaded.
  * @fdt:		Flattened device tree for the next kernel.
  * @initrd_load_addr:	Address where the next initrd will be loaded.
  * @initrd_len:		Size of the next initrd, or 0 if there will be none.
@@ -217,8 +219,9 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
  *
  * Return: 0 on success, or negative errno on error.
  */
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
-		  unsigned long initrd_len, const char *cmdline)
+int setup_new_fdt(const struct kimage *image, void *fdt,
+		  unsigned long initrd_load_addr, unsigned long initrd_len,
+		  const char *cmdline)
 {
 	int ret, chosen_node;
 	const void *prop;
@@ -328,6 +331,12 @@ int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
 		}
 	}
 
+	ret = setup_ima_buffer(image, fdt, chosen_node);
+	if (ret) {
+		pr_err("Error setting up the new device tree.\n");
+		return ret;
+	}
+
 	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
 	if (ret) {
 		pr_err("Error setting up the new device tree.\n");
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 307b1f4543de..2e3c34b1df37 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -338,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void)
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
 			      kaslr_offset());
 	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+	VMCOREINFO_PHYS_BASE(phys_base);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
diff --git a/block/genhd.c b/block/genhd.c
index fcd6d4fae657..a178c8e59492 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -878,7 +878,7 @@ static int show_partition(struct seq_file *seqf, void *v)
 	char buf[BDEVNAME_SIZE];
 
 	/* Don't show non-partitionable removeable devices or empty devices */
-	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+	if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
 				   (sgp->flags & GENHD_FL_REMOVABLE)))
 		return 0;
 	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a464c8088170..ae05b93fa0e1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2612,20 +2612,48 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	spin_lock(&dlm->master_lock);
 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
 				    namelen, target, dlm->node_num);
+	if (ret == -EEXIST) {
+		if (oldmle)
+			__dlm_put_mle(oldmle);
+
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "another process is already migrating it\n");
+		goto fail;
+	}
+
+	/*
+	 * If an old mle is found, it should be put.  If its type is BLOCK,
+	 * it should be put again, because it has been unhashed from the map
+	 * in dlm_add_migration_mle().
+	 * Otherwise the memory will be leaked.  It will not be found again from
+	 * the hash map.
+	 */
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		__dlm_mle_detach_hb_events(dlm, oldmle);
+		__dlm_put_mle(oldmle);
+
+		/*
+		 * If the type of the mle is BLOCK, it should be put once for
+		 * release.  Otherwise a memory leak may be caused because
+		 * oldmle has been unhashed from the hash map and it will not
+		 * be found any more.
+		 */
+		if (oldmle->type == DLM_MLE_BLOCK)
+			__dlm_put_mle(oldmle);
+       }
+
 	/* get an extra reference on the mle.
 	 * otherwise the assert_master from the new
 	 * master will destroy this.
 	 */
 	dlm_get_mle_inuse(mle);
+	mle_added = 1;
+
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
-	if (ret == -EEXIST) {
-		mlog(0, "another process is already migrating it\n");
-		goto fail;
-	}
-	mle_added = 1;
-
 	/*
 	 * set the MIGRATING flag and flush asts
 	 * if we fail after this we need to re-dirty the lockres
@@ -2642,12 +2670,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	}
 
 fail:
-	if (ret != -EEXIST && oldmle) {
-		/* master is known, detach if not already detached */
-		dlm_mle_detach_hb_events(dlm, oldmle);
-		dlm_put_mle(oldmle);
-	}
-
 	if (ret < 0) {
 		if (mle_added) {
 			dlm_mle_detach_hb_events(dlm, mle);
@@ -3182,16 +3204,24 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	if (ret < 0)
 		kmem_cache_free(dlm_mle_cache, mle);
 
+	/*
+	 * If an old mle is found, it should be put.  If its type is BLOCK,
+	 * it should be put again because it has been unhashed from the map
+	 * in dlm_add_migration_mle().
+	 * Otherwise the memory will be leaked.  It will not be found again from
+	 * the hash map.
+	 */
+	if (oldmle) {
+		__dlm_mle_detach_hb_events(dlm, oldmle);
+		__dlm_put_mle(oldmle);
+		if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK)
+			__dlm_put_mle(oldmle);
+	}
+
 	spin_unlock(&dlm->master_lock);
 unlock:
 	spin_unlock(&dlm->spinlock);
 
-	if (oldmle) {
-		/* master is known, detach if not already detached */
-		dlm_mle_detach_hb_events(dlm, oldmle);
-		dlm_put_mle(oldmle);
-	}
-
 	if (res)
 		dlm_lockres_put(res);
 leave:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 83d576f6a287..77d1632e905d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
 	     lockres->l_level, new_level);
 
+	/*
+	 * On DLM_LKF_VALBLK, fsdlm behaves differently from o2cb. It always
+	 * expects DLM_LKF_VALBLK to be set if the LKB has an LVB, so that
+	 * we can recover correctly from node failure. Otherwise, we may get
+	 * an invalid LVB in the LKB without DLM_SBF_VALNOTVALID being set.
+	 */
+	if (!ocfs2_is_o2cb_active() &&
+	    lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+		lvb = 1;
+
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 52c07346bea3..820359096c7a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
  */
 static struct ocfs2_stack_plugin *active_stack;
 
+inline int ocfs2_is_o2cb_active(void)
+{
+	return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
+}
+EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
+
 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
 {
 	struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index f2dce10fae54..e3036e1790e8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
 
+/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
+int ocfs2_is_o2cb_active(void);
+
 extern struct kset *ocfs2_kset;
 
 #endif  /* STACKGLUE_H */
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 000000000000..bba7a4d692b3
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL           0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata:	pointer to the data to compute checksum for.
+ * @nbytes:	number of bytes in data buffer.
+ * @seed:	CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0eb7c2e7f0d6..7f6952f8d6aa 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -11,6 +11,7 @@
 #define _LINUX_IMA_H
 
 #include <linux/fs.h>
+#include <linux/kexec.h>
 struct linux_binprm;
 
 #ifdef CONFIG_IMA
@@ -23,6 +24,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
 			      enum kernel_read_file_id id);
 extern void ima_post_path_mknod(struct dentry *dentry);
 
+#ifdef CONFIG_IMA_KEXEC
+extern void ima_add_kexec_buffer(struct kimage *image);
+#endif
+
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
 {
@@ -62,6 +67,13 @@ static inline void ima_post_path_mknod(struct dentry *dentry)
 
 #endif /* CONFIG_IMA */
 
+#ifndef CONFIG_IMA_KEXEC
+struct kimage;
+
+static inline void ima_add_kexec_buffer(struct kimage *image)
+{}
+#endif
+
 #ifdef CONFIG_IMA_APPRAISE
 extern void ima_inode_post_setattr(struct dentry *dentry);
 extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d419d0e51fe5..e98e546b543c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -283,6 +283,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 	vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
 #define VMCOREINFO_CONFIG(name) \
 	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+	vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
 
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index aacca824a6ae..0a3fadc32693 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -110,6 +110,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern unsigned long *watchdog_cpumask_bits;
+extern atomic_t watchdog_park_in_progress;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 57c9e0622a38..56375edf2ed2 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -77,8 +77,11 @@ extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
 
 #ifdef CONFIG_PRINTK
 
-#define WARN_ON_RATELIMIT(condition, state)			\
-		WARN_ON((condition) && __ratelimit(state))
+#define WARN_ON_RATELIMIT(condition, state)	({		\
+	bool __rtn_cond = !!(condition);			\
+	WARN_ON(__rtn_cond && __ratelimit(state));		\
+	__rtn_cond;						\
+})
 
 #define WARN_RATELIMIT(condition, format, ...)			\
 ({								\
diff --git a/include/linux/sem.h b/include/linux/sem.h
index d0efd6e6c20a..4fc222f8755d 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,7 +21,7 @@ struct sem_array {
 	struct list_head	list_id;	/* undo requests on this array */
 	int			sem_nsems;	/* no. of semaphores in array */
 	int			complex_count;	/* pending complex operations */
-	bool			complex_mode;	/* no parallel simple ops */
+	unsigned int		use_global_lock;/* >0: global lock required */
 };
 
 #ifdef CONFIG_SYSVIPC
diff --git a/ipc/sem.c b/ipc/sem.c
index e08b94851922..31eaa87ecba4 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -159,22 +159,42 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
 
 /*
+ * Switching from the mode suitable for simple ops
+ * to the mode for complex ops is costly. Therefore:
+ * use some hysteresis
+ */
+#define USE_GLOBAL_LOCK_HYSTERESIS	10
+
+/*
  * Locking:
  * a) global sem_lock() for read/write
  *	sem_undo.id_next,
  *	sem_array.complex_count,
- *	sem_array.complex_mode
  *	sem_array.pending{_alter,_const},
  *	sem_array.sem_undo
  *
  * b) global or semaphore sem_lock() for read/write:
  *	sem_array.sem_base[i].pending_{const,alter}:
- *	sem_array.complex_mode (for read)
  *
  * c) special:
  *	sem_undo_list.list_proc:
  *	* undo_list->lock for write
  *	* rcu for read
+ *	use_global_lock:
+ *	* global sem_lock() for write
+ *	* either local or global sem_lock() for read.
+ *
+ * Memory ordering:
+ * Most ordering is enforced by using spin_lock() and spin_unlock().
+ * The special case is use_global_lock:
+ * Setting it from non-zero to 0 is a RELEASE, this is ensured by
+ * using smp_store_release().
+ * Testing if it is non-zero is an ACQUIRE, this is ensured by using
+ * smp_load_acquire().
+ * Setting it from 0 to non-zero must be ordered with regards to
+ * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
+ * is inside a spin_lock() and after a write from 0 to non-zero a
+ * spin_lock()+spin_unlock() is done.
  */
 
 #define sc_semmsl	sem_ctls[0]
@@ -273,29 +293,22 @@ static void complexmode_enter(struct sem_array *sma)
 	int i;
 	struct sem *sem;
 
-	if (sma->complex_mode)  {
-		/* We are already in complex_mode. Nothing to do */
+	if (sma->use_global_lock > 0)  {
+		/*
+		 * We are already in global lock mode.
+		 * Nothing to do, just reset the
+		 * counter until we return to simple mode.
+		 */
+		sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 		return;
 	}
-
-	/* We need a full barrier after seting complex_mode:
-	 * The write to complex_mode must be visible
-	 * before we read the first sem->lock spinlock state.
-	 */
-	smp_store_mb(sma->complex_mode, true);
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 
 	for (i = 0; i < sma->sem_nsems; i++) {
 		sem = sma->sem_base + i;
-		spin_unlock_wait(&sem->lock);
+		spin_lock(&sem->lock);
+		spin_unlock(&sem->lock);
 	}
-	/*
-	 * spin_unlock_wait() is not a memory barriers, it is only a
-	 * control barrier. The code must pair with spin_unlock(&sem->lock),
-	 * thus just the control barrier is insufficient.
-	 *
-	 * smp_rmb() is sufficient, as writes cannot pass the control barrier.
-	 */
-	smp_rmb();
 }
 
 /*
@@ -310,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma)
 		 */
 		return;
 	}
-	/*
-	 * Immediately after setting complex_mode to false,
-	 * a simple op can start. Thus: all memory writes
-	 * performed by the current operation must be visible
-	 * before we set complex_mode to false.
-	 */
-	smp_store_release(&sma->complex_mode, false);
+	if (sma->use_global_lock == 1) {
+		/*
+		 * Immediately after setting use_global_lock to 0,
+		 * a simple op can start. Thus: all memory writes
+		 * performed by the current operation must be visible
+		 * before we set use_global_lock to 0.
+		 */
+		smp_store_release(&sma->use_global_lock, 0);
+	} else {
+		sma->use_global_lock--;
+	}
 }
 
 #define SEM_GLOBAL_LOCK	(-1)
@@ -346,30 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	 * Optimized locking is possible if no complex operation
 	 * is either enqueued or processed right now.
 	 *
-	 * Both facts are tracked by complex_mode.
+	 * Both facts are tracked by use_global_lock.
 	 */
 	sem = sma->sem_base + sops->sem_num;
 
 	/*
-	 * Initial check for complex_mode. Just an optimization,
+	 * Initial check for use_global_lock. Just an optimization,
 	 * no locking, no memory barrier.
 	 */
-	if (!sma->complex_mode) {
+	if (!sma->use_global_lock) {
 		/*
 		 * It appears that no complex operation is around.
 		 * Acquire the per-semaphore lock.
 		 */
 		spin_lock(&sem->lock);
 
-		/*
-		 * See 51d7d5205d33
-		 * ("powerpc: Add smp_mb() to arch_spin_is_locked()"):
-		 * A full barrier is required: the write of sem->lock
-		 * must be visible before the read is executed
-		 */
-		smp_mb();
-
-		if (!smp_load_acquire(&sma->complex_mode)) {
+		/* pairs with smp_store_release() */
+		if (!smp_load_acquire(&sma->use_global_lock)) {
 			/* fast path successful! */
 			return sops->sem_num;
 		}
@@ -379,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 	/* slow path: acquire the full lock */
 	ipc_lock_object(&sma->sem_perm);
 
-	if (sma->complex_count == 0) {
-		/* False alarm:
-		 * There is no complex operation, thus we can switch
-		 * back to the fast path.
+	if (sma->use_global_lock == 0) {
+		/*
+		 * The use_global_lock mode ended while we waited for
+		 * sma->sem_perm.lock. Thus we must switch to locking
+		 * with sem->lock.
+		 * Unlike in the fast path, there is no need to recheck
+		 * sma->use_global_lock after we have acquired sem->lock:
+		 * We own sma->sem_perm.lock, thus use_global_lock cannot
+		 * change.
 		 */
 		spin_lock(&sem->lock);
+
 		ipc_unlock_object(&sma->sem_perm);
 		return sops->sem_num;
 	} else {
-		/* Not a false alarm, thus complete the sequence for a
-		 * full lock.
+		/*
+		 * Not a false alarm, thus continue to use the global lock
+		 * mode. No need for complexmode_enter(), this was done by
+		 * the caller that has set use_global_lock to non-zero.
 		 */
-		complexmode_enter(sma);
 		return SEM_GLOBAL_LOCK;
 	}
 }
@@ -495,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	}
 
 	sma->complex_count = 0;
-	sma->complex_mode = true; /* dropped by sem_unlock below */
+	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
 	INIT_LIST_HEAD(&sma->pending_alter);
 	INIT_LIST_HEAD(&sma->pending_const);
 	INIT_LIST_HEAD(&sma->list_id);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index cc2fa35ca480..85e5546cd791 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -19,6 +19,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/kcov.h>
+#include <asm/setup.h>
 
 /*
  * kcov descriptor (one per opened debugfs file).
@@ -73,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void)
 	if (mode == KCOV_MODE_TRACE) {
 		unsigned long *area;
 		unsigned long pos;
+		unsigned long ip = _RET_IP_;
+
+#ifdef CONFIG_RANDOMIZE_BASE
+		ip -= kaslr_offset();
+#endif
 
 		/*
 		 * There is some code that runs in interrupts but for which
@@ -86,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void)
 		/* The first word is number of subsequent PCs. */
 		pos = READ_ONCE(area[0]) + 1;
 		if (likely(pos < t->kcov_size)) {
-			area[pos] = _RET_IP_;
+			area[pos] = ip;
 			WRITE_ONCE(area[0], pos);
 		}
 	}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 0c2df7f73792..b56a558e406d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,6 +19,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/fs.h>
+#include <linux/ima.h>
 #include <crypto/hash.h>
 #include <crypto/sha.h>
 #include <linux/syscalls.h>
@@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 		return ret;
 	image->kernel_buf_len = size;
 
+	/* IMA needs to pass the measurement list to the next kernel. */
+	ima_add_kexec_buffer(image);
+
 	/* Call arch image probe handlers */
 	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
 					    image->kernel_buf_len);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d4b0fa01cae3..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -49,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
 /*
  * The 'watchdog_running' variable is set to 1 when the watchdog threads
  * are registered/started and is set to 0 when the watchdog threads are
@@ -260,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
+	if (atomic_read(&watchdog_park_in_progress) != 0)
+		return HRTIMER_NORESTART;
+
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
 
@@ -467,12 +472,16 @@ static int watchdog_park_threads(void)
 {
 	int cpu, ret = 0;
 
+	atomic_set(&watchdog_park_in_progress, 1);
+
 	for_each_watchdog_cpu(cpu) {
 		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
 		if (ret)
 			break;
 	}
 
+	atomic_set(&watchdog_park_in_progress, 0);
+
 	return ret;
 }
 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 84016c8aee6b..12b8dd640786 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -84,6 +84,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
+	if (atomic_read(&watchdog_park_in_progress) != 0)
+		return;
+
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
diff --git a/lib/Kconfig b/lib/Kconfig
index 260a80e313b9..924697dbd367 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -185,6 +185,13 @@ config CRC8
 	  when they need to do cyclic redundancy check according CRC8
 	  algorithm. Module will be called crc8.
 
+config CRC64_ECMA
+	tristate "CRC64 ECMA function"
+	help
+	  This option provides the CRC64 ECMA function. Drivers may select this
+	  when they need to do cyclic redundancy check according to the CRC64
+	  ECMA algorithm.
+
 config AUDIT_GENERIC
 	bool
 	depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7446097f72bd..cb66a4648840 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -26,7 +26,7 @@ config CONSOLE_LOGLEVEL_DEFAULT
 	  the kernel bootargs. loglevel=<x> continues to override whatever
 	  value is specified here as well.
 
-	  Note: This does not affect the log level of un-prefixed prink()
+	  Note: This does not affect the log level of un-prefixed printk()
 	  usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
 	  option.
 
diff --git a/lib/Makefile b/lib/Makefile
index 50144a3aeebd..d15e235f72ea 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_CRC32)	+= crc32.o
 obj-$(CONFIG_CRC7)	+= crc7.o
 obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
 obj-$(CONFIG_CRC8)	+= crc8.o
+obj-$(CONFIG_CRC64_ECMA)	+= crc64_ecma.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
 obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 000000000000..41629ea5a60c
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK			0xFF
+#define CRC64_TABLE_SIZE		256
+
+
+struct crc64_table {
+	u64 seed;			/* returned by crc64_ecma_seed() */
+	u64 table[CRC64_TABLE_SIZE];	/* indexed by a byte value */
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+	CRC64_DEFAULT_INITVAL,
+	{
+		0x0000000000000000ULL,
+		0xb32e4cbe03a75f6fULL,
+		0xf4843657a840a05bULL,
+		0x47aa7ae9abe7ff34ULL,
+		0x7bd0c384ff8f5e33ULL,
+		0xc8fe8f3afc28015cULL,
+		0x8f54f5d357cffe68ULL,
+		0x3c7ab96d5468a107ULL,
+		0xf7a18709ff1ebc66ULL,
+		0x448fcbb7fcb9e309ULL,
+		0x0325b15e575e1c3dULL,
+		0xb00bfde054f94352ULL,
+		0x8c71448d0091e255ULL,
+		0x3f5f08330336bd3aULL,
+		0x78f572daa8d1420eULL,
+		0xcbdb3e64ab761d61ULL,
+		0x7d9ba13851336649ULL,
+		0xceb5ed8652943926ULL,
+		0x891f976ff973c612ULL,
+		0x3a31dbd1fad4997dULL,
+		0x064b62bcaebc387aULL,
+		0xb5652e02ad1b6715ULL,
+		0xf2cf54eb06fc9821ULL,
+		0x41e11855055bc74eULL,
+		0x8a3a2631ae2dda2fULL,
+		0x39146a8fad8a8540ULL,
+		0x7ebe1066066d7a74ULL,
+		0xcd905cd805ca251bULL,
+		0xf1eae5b551a2841cULL,
+		0x42c4a90b5205db73ULL,
+		0x056ed3e2f9e22447ULL,
+		0xb6409f5cfa457b28ULL,
+		0xfb374270a266cc92ULL,
+		0x48190ecea1c193fdULL,
+		0x0fb374270a266cc9ULL,
+		0xbc9d3899098133a6ULL,
+		0x80e781f45de992a1ULL,
+		0x33c9cd4a5e4ecdceULL,
+		0x7463b7a3f5a932faULL,
+		0xc74dfb1df60e6d95ULL,
+		0x0c96c5795d7870f4ULL,
+		0xbfb889c75edf2f9bULL,
+		0xf812f32ef538d0afULL,
+		0x4b3cbf90f69f8fc0ULL,
+		0x774606fda2f72ec7ULL,
+		0xc4684a43a15071a8ULL,
+		0x83c230aa0ab78e9cULL,
+		0x30ec7c140910d1f3ULL,
+		0x86ace348f355aadbULL,
+		0x3582aff6f0f2f5b4ULL,
+		0x7228d51f5b150a80ULL,
+		0xc10699a158b255efULL,
+		0xfd7c20cc0cdaf4e8ULL,
+		0x4e526c720f7dab87ULL,
+		0x09f8169ba49a54b3ULL,
+		0xbad65a25a73d0bdcULL,
+		0x710d64410c4b16bdULL,
+		0xc22328ff0fec49d2ULL,
+		0x85895216a40bb6e6ULL,
+		0x36a71ea8a7ace989ULL,
+		0x0adda7c5f3c4488eULL,
+		0xb9f3eb7bf06317e1ULL,
+		0xfe5991925b84e8d5ULL,
+		0x4d77dd2c5823b7baULL,
+		0x64b62bcaebc387a1ULL,
+		0xd7986774e864d8ceULL,
+		0x90321d9d438327faULL,
+		0x231c512340247895ULL,
+		0x1f66e84e144cd992ULL,
+		0xac48a4f017eb86fdULL,
+		0xebe2de19bc0c79c9ULL,
+		0x58cc92a7bfab26a6ULL,
+		0x9317acc314dd3bc7ULL,
+		0x2039e07d177a64a8ULL,
+		0x67939a94bc9d9b9cULL,
+		0xd4bdd62abf3ac4f3ULL,
+		0xe8c76f47eb5265f4ULL,
+		0x5be923f9e8f53a9bULL,
+		0x1c4359104312c5afULL,
+		0xaf6d15ae40b59ac0ULL,
+		0x192d8af2baf0e1e8ULL,
+		0xaa03c64cb957be87ULL,
+		0xeda9bca512b041b3ULL,
+		0x5e87f01b11171edcULL,
+		0x62fd4976457fbfdbULL,
+		0xd1d305c846d8e0b4ULL,
+		0x96797f21ed3f1f80ULL,
+		0x2557339fee9840efULL,
+		0xee8c0dfb45ee5d8eULL,
+		0x5da24145464902e1ULL,
+		0x1a083bacedaefdd5ULL,
+		0xa9267712ee09a2baULL,
+		0x955cce7fba6103bdULL,
+		0x267282c1b9c65cd2ULL,
+		0x61d8f8281221a3e6ULL,
+		0xd2f6b4961186fc89ULL,
+		0x9f8169ba49a54b33ULL,
+		0x2caf25044a02145cULL,
+		0x6b055fede1e5eb68ULL,
+		0xd82b1353e242b407ULL,
+		0xe451aa3eb62a1500ULL,
+		0x577fe680b58d4a6fULL,
+		0x10d59c691e6ab55bULL,
+		0xa3fbd0d71dcdea34ULL,
+		0x6820eeb3b6bbf755ULL,
+		0xdb0ea20db51ca83aULL,
+		0x9ca4d8e41efb570eULL,
+		0x2f8a945a1d5c0861ULL,
+		0x13f02d374934a966ULL,
+		0xa0de61894a93f609ULL,
+		0xe7741b60e174093dULL,
+		0x545a57dee2d35652ULL,
+		0xe21ac88218962d7aULL,
+		0x5134843c1b317215ULL,
+		0x169efed5b0d68d21ULL,
+		0xa5b0b26bb371d24eULL,
+		0x99ca0b06e7197349ULL,
+		0x2ae447b8e4be2c26ULL,
+		0x6d4e3d514f59d312ULL,
+		0xde6071ef4cfe8c7dULL,
+		0x15bb4f8be788911cULL,
+		0xa6950335e42fce73ULL,
+		0xe13f79dc4fc83147ULL,
+		0x521135624c6f6e28ULL,
+		0x6e6b8c0f1807cf2fULL,
+		0xdd45c0b11ba09040ULL,
+		0x9aefba58b0476f74ULL,
+		0x29c1f6e6b3e0301bULL,
+		0xc96c5795d7870f42ULL,
+		0x7a421b2bd420502dULL,
+		0x3de861c27fc7af19ULL,
+		0x8ec62d7c7c60f076ULL,
+		0xb2bc941128085171ULL,
+		0x0192d8af2baf0e1eULL,
+		0x4638a2468048f12aULL,
+		0xf516eef883efae45ULL,
+		0x3ecdd09c2899b324ULL,
+		0x8de39c222b3eec4bULL,
+		0xca49e6cb80d9137fULL,
+		0x7967aa75837e4c10ULL,
+		0x451d1318d716ed17ULL,
+		0xf6335fa6d4b1b278ULL,
+		0xb199254f7f564d4cULL,
+		0x02b769f17cf11223ULL,
+		0xb4f7f6ad86b4690bULL,
+		0x07d9ba1385133664ULL,
+		0x4073c0fa2ef4c950ULL,
+		0xf35d8c442d53963fULL,
+		0xcf273529793b3738ULL,
+		0x7c0979977a9c6857ULL,
+		0x3ba3037ed17b9763ULL,
+		0x888d4fc0d2dcc80cULL,
+		0x435671a479aad56dULL,
+		0xf0783d1a7a0d8a02ULL,
+		0xb7d247f3d1ea7536ULL,
+		0x04fc0b4dd24d2a59ULL,
+		0x3886b22086258b5eULL,
+		0x8ba8fe9e8582d431ULL,
+		0xcc0284772e652b05ULL,
+		0x7f2cc8c92dc2746aULL,
+		0x325b15e575e1c3d0ULL,
+		0x8175595b76469cbfULL,
+		0xc6df23b2dda1638bULL,
+		0x75f16f0cde063ce4ULL,
+		0x498bd6618a6e9de3ULL,
+		0xfaa59adf89c9c28cULL,
+		0xbd0fe036222e3db8ULL,
+		0x0e21ac88218962d7ULL,
+		0xc5fa92ec8aff7fb6ULL,
+		0x76d4de52895820d9ULL,
+		0x317ea4bb22bfdfedULL,
+		0x8250e80521188082ULL,
+		0xbe2a516875702185ULL,
+		0x0d041dd676d77eeaULL,
+		0x4aae673fdd3081deULL,
+		0xf9802b81de97deb1ULL,
+		0x4fc0b4dd24d2a599ULL,
+		0xfceef8632775faf6ULL,
+		0xbb44828a8c9205c2ULL,
+		0x086ace348f355aadULL,
+		0x34107759db5dfbaaULL,
+		0x873e3be7d8faa4c5ULL,
+		0xc094410e731d5bf1ULL,
+		0x73ba0db070ba049eULL,
+		0xb86133d4dbcc19ffULL,
+		0x0b4f7f6ad86b4690ULL,
+		0x4ce50583738cb9a4ULL,
+		0xffcb493d702be6cbULL,
+		0xc3b1f050244347ccULL,
+		0x709fbcee27e418a3ULL,
+		0x3735c6078c03e797ULL,
+		0x841b8ab98fa4b8f8ULL,
+		0xadda7c5f3c4488e3ULL,
+		0x1ef430e13fe3d78cULL,
+		0x595e4a08940428b8ULL,
+		0xea7006b697a377d7ULL,
+		0xd60abfdbc3cbd6d0ULL,
+		0x6524f365c06c89bfULL,
+		0x228e898c6b8b768bULL,
+		0x91a0c532682c29e4ULL,
+		0x5a7bfb56c35a3485ULL,
+		0xe955b7e8c0fd6beaULL,
+		0xaeffcd016b1a94deULL,
+		0x1dd181bf68bdcbb1ULL,
+		0x21ab38d23cd56ab6ULL,
+		0x9285746c3f7235d9ULL,
+		0xd52f0e859495caedULL,
+		0x6601423b97329582ULL,
+		0xd041dd676d77eeaaULL,
+		0x636f91d96ed0b1c5ULL,
+		0x24c5eb30c5374ef1ULL,
+		0x97eba78ec690119eULL,
+		0xab911ee392f8b099ULL,
+		0x18bf525d915feff6ULL,
+		0x5f1528b43ab810c2ULL,
+		0xec3b640a391f4fadULL,
+		0x27e05a6e926952ccULL,
+		0x94ce16d091ce0da3ULL,
+		0xd3646c393a29f297ULL,
+		0x604a2087398eadf8ULL,
+		0x5c3099ea6de60cffULL,
+		0xef1ed5546e415390ULL,
+		0xa8b4afbdc5a6aca4ULL,
+		0x1b9ae303c601f3cbULL,
+		0x56ed3e2f9e224471ULL,
+		0xe5c372919d851b1eULL,
+		0xa26908783662e42aULL,
+		0x114744c635c5bb45ULL,
+		0x2d3dfdab61ad1a42ULL,
+		0x9e13b115620a452dULL,
+		0xd9b9cbfcc9edba19ULL,
+		0x6a978742ca4ae576ULL,
+		0xa14cb926613cf817ULL,
+		0x1262f598629ba778ULL,
+		0x55c88f71c97c584cULL,
+		0xe6e6c3cfcadb0723ULL,
+		0xda9c7aa29eb3a624ULL,
+		0x69b2361c9d14f94bULL,
+		0x2e184cf536f3067fULL,
+		0x9d36004b35545910ULL,
+		0x2b769f17cf112238ULL,
+		0x9858d3a9ccb67d57ULL,
+		0xdff2a94067518263ULL,
+		0x6cdce5fe64f6dd0cULL,
+		0x50a65c93309e7c0bULL,
+		0xe388102d33392364ULL,
+		0xa4226ac498dedc50ULL,
+		0x170c267a9b79833fULL,
+		0xdcd7181e300f9e5eULL,
+		0x6ff954a033a8c131ULL,
+		0x28532e49984f3e05ULL,
+		0x9b7d62f79be8616aULL,
+		0xa707db9acf80c06dULL,
+		0x14299724cc279f02ULL,
+		0x5383edcd67c06036ULL,
+		0xe0ada17364673f59ULL
+	}
+};
+
+
+/*
+ * crc64_ecma_seed - Returns the seed stored in the CRC64_ECMA_182 table.
+ */
+u64 crc64_ecma_seed(void)
+{
+	return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC of a data buffer.
+ *
+ * pdata: pointer to the first byte of the data to checksum.
+ * nbytes: number of bytes to process; 0 returns seed unchanged.
+ * seed: CRC value to start from.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+	u64 crc = seed;
+
+	while (nbytes--) {
+		u8 idx = (crc ^ *pdata++) & CRC64_BYTE_MASK;
+
+		crc = CRC64_ECMA_182.table[idx] ^ (crc >> 8);
+	}
+	return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 6c707bfe02fd..a43013112581 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		}
 
 		if (end_index >= start_index) {
-			unsigned long count = invalidate_mapping_pages(mapping,
+			unsigned long count;
+
+			/*
+			 * It's common to FADV_DONTNEED right after
+			 * the read or write that instantiates the
+			 * pages, in which case there will be some
+			 * sitting on the local LRU cache. Try to
+			 * avoid the expensive remote drain and the
+			 * second cache tree walk below by flushing
+			 * them out right away.
+			 */
+			lru_add_drain();
+
+			count = invalidate_mapping_pages(mapping,
 						start_index, end_index);
 
 			/*
diff --git a/mm/internal.h b/mm/internal.h
index 44d68895a9b9..ebb3cbd21937 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -131,9 +131,9 @@ struct alloc_context {
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
 static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
 {
-	return page_idx ^ (1 << order);
+	return page_pfn ^ (1 << order);
 }
 
 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e32389a97030..b0924a68cc36 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 	struct vm_area_struct *vma;
 	unsigned long addr;
 	pmd_t *pmd, _pmd;
-	bool deposited = false;
 
 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
 			/* assume page table is clear */
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
-			/*
-			 * now deposit the pgtable for arch that need it
-			 * otherwise free it.
-			 */
-			if (arch_needs_pgtable_deposit()) {
-				/*
-				 * The deposit should be visibile only after
-				 * collapse is seen by others.
-				 */
-				smp_wmb();
-				pgtable_trans_huge_deposit(vma->vm_mm, pmd,
-							   pmd_pgtable(_pmd));
-				deposited = true;
-			}
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
-			if (!deposited) {
-				atomic_long_dec(&vma->vm_mm->nr_ptes);
-				pte_free(vma->vm_mm, pmd_pgtable(_pmd));
-			}
+			atomic_long_dec(&vma->vm_mm->nr_ptes);
+			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
 		}
 	}
 	i_mmap_unlock_write(mapping);
diff --git a/mm/memory.c b/mm/memory.c
index 455c3e628d52..36c774f9259e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3008,13 +3008,6 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
 	ret = 0;
 	count_vm_event(THP_FILE_MAPPED);
 out:
-	/*
-	 * If we are going to fallback to pte mapping, do a
-	 * withdraw with pmd lock held.
-	 */
-	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
-		vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
-								vmf->pmd);
 	spin_unlock(vmf->ptl);
 	return ret;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..ef731757b80f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -714,7 +714,7 @@ static inline void rmv_page_order(struct page *page)
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
  * (b) the buddy is in the buddy system &&
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
@@ -729,9 +729,6 @@ static inline void rmv_page_order(struct page *page)
 static inline int page_is_buddy(struct page *page, struct page *buddy,
 							unsigned int order)
 {
-	if (!pfn_valid_within(page_to_pfn(buddy)))
-		return 0;
-
 	if (page_is_guard(buddy) && page_order(buddy) == order) {
 		if (page_zone_id(page) != page_zone_id(buddy))
 			return 0;
@@ -787,9 +784,8 @@ static inline void __free_one_page(struct page *page,
 		struct zone *zone, unsigned int order,
 		int migratetype)
 {
-	unsigned long page_idx;
-	unsigned long combined_idx;
-	unsigned long uninitialized_var(buddy_idx);
+	unsigned long combined_pfn;
+	unsigned long uninitialized_var(buddy_pfn);
 	struct page *buddy;
 	unsigned int max_order;
 
@@ -802,15 +798,16 @@ static inline void __free_one_page(struct page *page,
 	if (likely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
 
-	page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
-	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
 continue_merging:
 	while (order < max_order - 1) {
-		buddy_idx = __find_buddy_index(page_idx, order);
-		buddy = page + (buddy_idx - page_idx);
+		buddy_pfn = __find_buddy_pfn(pfn, order);
+		buddy = page + (buddy_pfn - pfn);
+
+		if (!pfn_valid_within(buddy_pfn))
+			goto done_merging;
 		if (!page_is_buddy(page, buddy, order))
 			goto done_merging;
 		/*
@@ -824,9 +821,9 @@ continue_merging:
 			zone->free_area[order].nr_free--;
 			rmv_page_order(buddy);
 		}
-		combined_idx = buddy_idx & page_idx;
-		page = page + (combined_idx - page_idx);
-		page_idx = combined_idx;
+		combined_pfn = buddy_pfn & pfn;
+		page = page + (combined_pfn - pfn);
+		pfn = combined_pfn;
 		order++;
 	}
 	if (max_order < MAX_ORDER) {
@@ -841,8 +838,8 @@ continue_merging:
 		if (unlikely(has_isolate_pageblock(zone))) {
 			int buddy_mt;
 
-			buddy_idx = __find_buddy_index(page_idx, order);
-			buddy = page + (buddy_idx - page_idx);
+			buddy_pfn = __find_buddy_pfn(pfn, order);
+			buddy = page + (buddy_pfn - pfn);
 			buddy_mt = get_pageblock_migratetype(buddy);
 
 			if (migratetype != buddy_mt
@@ -865,12 +862,12 @@ done_merging:
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
 		struct page *higher_page, *higher_buddy;
-		combined_idx = buddy_idx & page_idx;
-		higher_page = page + (combined_idx - page_idx);
-		buddy_idx = __find_buddy_index(combined_idx, order + 1);
-		higher_buddy = higher_page + (buddy_idx - combined_idx);
+		combined_pfn = buddy_pfn & pfn;
+		higher_page = page + (combined_pfn - pfn);
+		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
@@ -2603,6 +2600,9 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 	if (z->node == local_nid) {
 		__inc_zone_state(z, NUMA_HIT);
 		__inc_zone_state(z, local_stat);
+	} else if (z->node == preferred_zone->node) {
+		__inc_zone_state(z, NUMA_HIT);
+		__inc_zone_state(z, NUMA_OTHER);
 	} else {
 		__inc_zone_state(z, NUMA_MISS);
 		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
@@ -3015,18 +3015,12 @@ static inline bool should_suppress_show_mem(void)
 	return ret;
 }
 
-static DEFINE_RATELIMIT_STATE(nopage_rs,
-		DEFAULT_RATELIMIT_INTERVAL,
-		DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask)
 {
 	unsigned int filter = SHOW_MEM_FILTER_NODES;
-	struct va_format vaf;
-	va_list args;
+	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-	    debug_guardpage_minorder() > 0)
+	if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
 		return;
 
 	/*
@@ -3041,6 +3035,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 		filter &= ~SHOW_MEM_FILTER_NODES;
 
+	show_mem(filter);
+}
+
+void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+	    debug_guardpage_minorder() > 0)
+		return;
+
 	pr_warn("%s: ", current->comm);
 
 	va_start(args, fmt);
@@ -3052,8 +3060,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
 
 	dump_stack();
-	if (!should_suppress_show_mem())
-		show_mem(filter);
+	warn_alloc_show_mem(gfp_mask);
 }
 
 static inline struct page *
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5594bfcc5ed..f4e17a57926a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	unsigned long flags, nr_pages;
 	bool isolated_page = false;
 	unsigned int order;
-	unsigned long page_idx, buddy_idx;
+	unsigned long pfn, buddy_pfn;
 	struct page *buddy;
 
 	zone = page_zone(page);
@@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	if (PageBuddy(page)) {
 		order = page_order(page);
 		if (order >= pageblock_order) {
-			page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
-			buddy_idx = __find_buddy_index(page_idx, order);
-			buddy = page + (buddy_idx - page_idx);
+			pfn = page_to_pfn(page);
+			buddy_pfn = __find_buddy_pfn(pfn, order);
+			buddy = page + (buddy_pfn - pfn);
 
-			if (pfn_valid_within(page_to_pfn(buddy)) &&
+			if (pfn_valid_within(buddy_pfn) &&
 			    !is_migrate_isolate_page(buddy)) {
 				__isolate_free_page(page, order);
 				isolated_page = true;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 60634dc53a88..c3cee247f2e6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 	 */
 	for (; pfn < end_pfn; ) {
 		if (!pfn_valid(pfn)) {
-			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+			pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 			continue;
 		}
 
@@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 	 */
 	for (; pfn < end_pfn; ) {
 		if (!pfn_valid(pfn)) {
-			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+			pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 			continue;
 		}
 
diff --git a/mm/shmem.c b/mm/shmem.c
index b1b20dc63265..354c340ee505 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2168,10 +2168,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 bool shmem_mapping(struct address_space *mapping)
 {
-	if (!mapping->host)
-		return false;
-
-	return mapping->host->i_sb->s_op == &shmem_ops;
+	return mapping->a_ops == &shmem_aops;
 }
 
 #ifdef CONFIG_TMPFS
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 8f9e89ca1d31..0ef0d4039066 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,28 +34,59 @@
 /*****************
  * Structures
 *****************/
+struct z3fold_pool;
+struct z3fold_ops {
+	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+	HEADLESS = 0,	/* page carries no z3fold_header */
+	FIRST,		/* buddy sized by first_chunks */
+	MIDDLE,		/* buddy sized by middle_chunks */
+	LAST,		/* buddy sized by last_chunks */
+	BUDDIES_MAX	/* NOTE(review): looks like an end marker - confirm */
+};
+
+/*
+ * struct z3fold_header - metadata at the start of each non-HEADLESS z3fold page
+ * @buddy:		links the z3fold page into the relevant list in the pool
+ * @page_lock:		per-page lock
+ * @first_chunks:	the size of the first buddy in chunks, 0 if free
+ * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
+ * @last_chunks:	the size of the last buddy in chunks, 0 if free
+ * @start_middle:	index of the chunk where the middle buddy starts
+ * @first_num:		the starting number (for the first handle)
+ */
+struct z3fold_header {
+	struct list_head buddy;
+	raw_spinlock_t page_lock;
+	unsigned short first_chunks;
+	unsigned short middle_chunks;
+	unsigned short last_chunks;
+	unsigned short start_middle;
+	unsigned short first_num:2;
+};
+
 /*
  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
  * adjusting internal fragmentation.  It also determines the number of
  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * in allocated page is occupied by z3fold header, NCHUNKS will be calculated
- * to 63 which shows the max number of free chunks in z3fold page, also there
- * will be 63 freelists per pool.
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
  */
 #define NCHUNKS_ORDER	6
 
 #define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
 #define CHUNK_SIZE	(1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
 #define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
 
-#define BUDDY_MASK	((1 << NCHUNKS_ORDER) - 1)
-
-struct z3fold_pool;
-struct z3fold_ops {
-	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
+#define BUDDY_MASK	(0x3)
 
 /**
  * struct z3fold_pool - stores metadata for each z3fold pool
@@ -80,37 +111,12 @@ struct z3fold_pool {
 	struct list_head unbuddied[NCHUNKS];
 	struct list_head buddied;
 	struct list_head lru;
-	u64 pages_nr;
+	atomic64_t pages_nr;
 	const struct z3fold_ops *ops;
 	struct zpool *zpool;
 	const struct zpool_ops *zpool_ops;
 };
 
-enum buddy {
-	HEADLESS = 0,
-	FIRST,
-	MIDDLE,
-	LAST,
-	BUDDIES_MAX
-};
-
-/*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
- *			z3fold page, except for HEADLESS pages
- * @buddy:	links the z3fold page into the relevant list in the pool
- * @first_chunks:	the size of the first buddy in chunks, 0 if free
- * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
- * @last_chunks:	the size of the last buddy in chunks, 0 if free
- * @first_num:		the starting number (for the first handle)
- */
-struct z3fold_header {
-	struct list_head buddy;
-	unsigned short first_chunks;
-	unsigned short middle_chunks;
-	unsigned short last_chunks;
-	unsigned short start_middle;
-	unsigned short first_num:NCHUNKS_ORDER;
-};
 
 /*
  * Internal z3fold page flags
@@ -121,6 +127,7 @@ enum z3fold_page_flags {
 	MIDDLE_CHUNK_MAPPED,
 };
 
+
 /*****************
  * Helpers
 *****************/
@@ -144,6 +151,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
 	clear_bit(PAGE_HEADLESS, &page->private);
 	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
 
+	raw_spin_lock_init(&zhdr->page_lock);
 	zhdr->first_chunks = 0;
 	zhdr->middle_chunks = 0;
 	zhdr->last_chunks = 0;
@@ -159,6 +167,19 @@ static void free_z3fold_page(struct z3fold_header *zhdr)
 	__free_page(virt_to_page(zhdr));
 }
 
+/* Lock a z3fold page by taking the raw spinlock in its header (page_lock) */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+	raw_spin_lock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page by releasing the raw spinlock in its header */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+	raw_spin_unlock(&zhdr->page_lock);
+}
+
+
 /*
  * Encodes the handle of a particular buddy within a z3fold page
  * Pool lock should be held as this function accesses first_num
@@ -179,7 +200,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
 	return (struct z3fold_header *)(handle & PAGE_MASK);
 }
 
-/* Returns buddy number */
+/*
+ * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle,
+ * but that doesn't matter because the masking will result in the
+ * correct buddy number.
+ */
 static enum buddy handle_to_buddy(unsigned long handle)
 {
 	struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
@@ -200,9 +225,10 @@ static int num_free_chunks(struct z3fold_header *zhdr)
 	 */
 	if (zhdr->middle_chunks != 0) {
 		int nfree_before = zhdr->first_chunks ?
-			0 : zhdr->start_middle - 1;
+			0 : zhdr->start_middle - ZHDR_CHUNKS;
 		int nfree_after = zhdr->last_chunks ?
-			0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+			0 : TOTAL_CHUNKS -
+				(zhdr->start_middle + zhdr->middle_chunks);
 		nfree = max(nfree_before, nfree_after);
 	} else
 		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
@@ -234,7 +260,7 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
 		INIT_LIST_HEAD(&pool->unbuddied[i]);
 	INIT_LIST_HEAD(&pool->buddied);
 	INIT_LIST_HEAD(&pool->lru);
-	pool->pages_nr = 0;
+	atomic64_set(&pool->pages_nr, 0);
 	pool->ops = ops;
 	return pool;
 }
@@ -250,26 +276,67 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
 	kfree(pool);
 }
 
+static inline void *mchunk_memmove(struct z3fold_header *zhdr,
+				unsigned short dst_chunk)
+{
+	void *beg = zhdr;
+	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
+		       beg + (zhdr->start_middle << CHUNK_SHIFT),
+		       zhdr->middle_chunks << CHUNK_SHIFT);
+}
+
+#define BIG_CHUNK_GAP	3
 /* Has to be called with lock held */
 static int z3fold_compact_page(struct z3fold_header *zhdr)
 {
 	struct page *page = virt_to_page(zhdr);
-	void *beg = zhdr;
+	int ret = 0;
+
+	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
+		goto out; /* can't move middle chunk, it's used */
 
+	if (zhdr->middle_chunks == 0)
+		goto out; /* nothing to compact */
 
-	if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
-	    zhdr->middle_chunks != 0 &&
-	    zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
-		memmove(beg + ZHDR_SIZE_ALIGNED,
-			beg + (zhdr->start_middle << CHUNK_SHIFT),
-			zhdr->middle_chunks << CHUNK_SHIFT);
+	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+		/* move to the beginning */
+		mchunk_memmove(zhdr, ZHDR_CHUNKS);
 		zhdr->first_chunks = zhdr->middle_chunks;
 		zhdr->middle_chunks = 0;
 		zhdr->start_middle = 0;
 		zhdr->first_num++;
-		return 1;
+		ret = 1;
+		goto out;
 	}
-	return 0;
+
+	/*
+	 * moving data is expensive, so let's only do that if
+	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
+	 */
+	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
+	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
+	    BIG_CHUNK_GAP) {
+		/* new_start: right after 1st chunk */
+		unsigned short new_start = zhdr->first_chunks + ZHDR_CHUNKS;
+		mchunk_memmove(zhdr, new_start);
+		zhdr->start_middle = new_start;
+		ret = 1;
+		goto out;
+	}
+	if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
+	    TOTAL_CHUNKS -
+	    (zhdr->last_chunks + zhdr->start_middle + zhdr->middle_chunks) >=
+	    BIG_CHUNK_GAP) {
+		/* new_start: right before last chunk */
+		unsigned short new_start = TOTAL_CHUNKS -
+			(zhdr->last_chunks + zhdr->middle_chunks);
+		mchunk_memmove(zhdr, new_start);
+		zhdr->start_middle = new_start;
+		ret = 1;
+		goto out;
+	}
+out:
+	return ret;
 }
 
 /**
@@ -309,50 +376,60 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 		bud = HEADLESS;
 	else {
 		chunks = size_to_chunks(size);
-		spin_lock(&pool->lock);
 
 		/* First, try to find an unbuddied z3fold page. */
 		zhdr = NULL;
 		for_each_unbuddied_list(i, chunks) {
-			if (!list_empty(&pool->unbuddied[i])) {
-				zhdr = list_first_entry(&pool->unbuddied[i],
+			spin_lock(&pool->lock);
+			zhdr = list_first_entry_or_null(&pool->unbuddied[i],
 						struct z3fold_header, buddy);
-				page = virt_to_page(zhdr);
-				if (zhdr->first_chunks == 0) {
-					if (zhdr->middle_chunks != 0 &&
-					    chunks >= zhdr->start_middle)
-						bud = LAST;
-					else
-						bud = FIRST;
-				} else if (zhdr->last_chunks == 0)
+			if (!zhdr) {
+				spin_unlock(&pool->lock);
+				continue;
+			}
+			list_del_init(&zhdr->buddy);
+			spin_unlock(&pool->lock);
+
+			page = virt_to_page(zhdr);
+			z3fold_page_lock(zhdr);
+			if (zhdr->first_chunks == 0) {
+				if (zhdr->middle_chunks != 0 &&
+				    chunks >= zhdr->start_middle)
 					bud = LAST;
-				else if (zhdr->middle_chunks == 0)
-					bud = MIDDLE;
-				else {
-					pr_err("No free chunks in unbuddied\n");
-					WARN_ON(1);
-					continue;
-				}
-				list_del(&zhdr->buddy);
-				goto found;
+				else
+					bud = FIRST;
+			} else if (zhdr->last_chunks == 0)
+				bud = LAST;
+			else if (zhdr->middle_chunks == 0)
+				bud = MIDDLE;
+			else {
+				spin_lock(&pool->lock);
+				list_add(&zhdr->buddy, &pool->buddied);
+				spin_unlock(&pool->lock);
+				z3fold_page_unlock(zhdr);
+				pr_err("No free chunks in unbuddied\n");
+				WARN_ON(1);
+				continue;
 			}
+			goto found;
 		}
 		bud = FIRST;
-		spin_unlock(&pool->lock);
 	}
 
 	/* Couldn't find unbuddied z3fold page, create new one */
 	page = alloc_page(gfp);
 	if (!page)
 		return -ENOMEM;
-	spin_lock(&pool->lock);
-	pool->pages_nr++;
+
+	atomic64_inc(&pool->pages_nr);
 	zhdr = init_z3fold_page(page);
 
 	if (bud == HEADLESS) {
 		set_bit(PAGE_HEADLESS, &page->private);
+		spin_lock(&pool->lock);
 		goto headless;
 	}
+	z3fold_page_lock(zhdr);
 
 found:
 	if (bud == FIRST)
@@ -361,9 +438,10 @@ found:
 		zhdr->last_chunks = chunks;
 	else {
 		zhdr->middle_chunks = chunks;
-		zhdr->start_middle = zhdr->first_chunks + 1;
+		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
 	}
 
+	spin_lock(&pool->lock);
 	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
 			zhdr->middle_chunks == 0) {
 		/* Add to unbuddied list */
@@ -383,6 +461,8 @@ headless:
 
 	*handle = encode_handle(zhdr, bud);
 	spin_unlock(&pool->lock);
+	if (bud != HEADLESS)
+		z3fold_page_unlock(zhdr);
 
 	return 0;
 }
@@ -404,7 +484,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	struct page *page;
 	enum buddy bud;
 
-	spin_lock(&pool->lock);
 	zhdr = handle_to_z3fold_header(handle);
 	page = virt_to_page(zhdr);
 
@@ -412,6 +491,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		/* HEADLESS page stored */
 		bud = HEADLESS;
 	} else {
+		z3fold_page_lock(zhdr);
 		bud = handle_to_buddy(handle);
 
 		switch (bud) {
@@ -428,38 +508,67 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		default:
 			pr_err("%s: unknown bud %d\n", __func__, bud);
 			WARN_ON(1);
-			spin_unlock(&pool->lock);
+			z3fold_page_unlock(zhdr);
 			return;
 		}
 	}
 
 	if (test_bit(UNDER_RECLAIM, &page->private)) {
 		/* z3fold page is under reclaim, reclaim will free */
-		spin_unlock(&pool->lock);
+		if (bud != HEADLESS)
+			z3fold_page_unlock(zhdr);
 		return;
 	}
 
+	/* Remove from existing buddy list */
 	if (bud != HEADLESS) {
-		/* Remove from existing buddy list */
-		list_del(&zhdr->buddy);
+		spin_lock(&pool->lock);
+		/*
+		 * this object may have been removed from its list by
+		 * z3fold_alloc(). In that case we just do nothing,
+		 * z3fold_alloc() will allocate an object and add the page
+		 * to the relevant list.
+		 */
+		if (!list_empty(&zhdr->buddy)) {
+			list_del(&zhdr->buddy);
+		} else {
+			spin_unlock(&pool->lock);
+			z3fold_page_unlock(zhdr);
+			return;
+		}
+		spin_unlock(&pool->lock);
 	}
 
 	if (bud == HEADLESS ||
 	    (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
 			zhdr->last_chunks == 0)) {
 		/* z3fold page is empty, free */
+		spin_lock(&pool->lock);
 		list_del(&page->lru);
+		spin_unlock(&pool->lock);
 		clear_bit(PAGE_HEADLESS, &page->private);
+		if (bud != HEADLESS)
+			z3fold_page_unlock(zhdr);
 		free_z3fold_page(zhdr);
-		pool->pages_nr--;
+		atomic64_dec(&pool->pages_nr);
 	} else {
-		z3fold_compact_page(zhdr);
+		int compacted = z3fold_compact_page(zhdr);
 		/* Add to the unbuddied list */
+		spin_lock(&pool->lock);
 		freechunks = num_free_chunks(zhdr);
-		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		/*
+		 * If the page has been compacted, we want to use it
+		 * in the first place.
+		 */
+		if (compacted)
+			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		else
+			list_add_tail(&zhdr->buddy,
+				      &pool->unbuddied[freechunks]);
+		spin_unlock(&pool->lock);
+		z3fold_page_unlock(zhdr);
 	}
 
-	spin_unlock(&pool->lock);
 }
 
 /**
@@ -506,12 +615,15 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
 
 	spin_lock(&pool->lock);
-	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
-			retries == 0) {
+	if (!pool->ops || !pool->ops->evict || retries == 0) {
 		spin_unlock(&pool->lock);
 		return -EINVAL;
 	}
 	for (i = 0; i < retries; i++) {
+		if (list_empty(&pool->lru)) {
+			spin_unlock(&pool->lock);
+			return -EINVAL;
+		}
 		page = list_last_entry(&pool->lru, struct page, lru);
 		list_del(&page->lru);
 
@@ -520,6 +632,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 		zhdr = page_address(page);
 		if (!test_bit(PAGE_HEADLESS, &page->private)) {
 			list_del(&zhdr->buddy);
+			spin_unlock(&pool->lock);
+			z3fold_page_lock(zhdr);
 			/*
 			 * We need encode the handles before unlocking, since
 			 * we can race with free that will set
@@ -534,13 +648,13 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				middle_handle = encode_handle(zhdr, MIDDLE);
 			if (zhdr->last_chunks)
 				last_handle = encode_handle(zhdr, LAST);
+			z3fold_page_unlock(zhdr);
 		} else {
 			first_handle = encode_handle(zhdr, HEADLESS);
 			last_handle = middle_handle = 0;
+			spin_unlock(&pool->lock);
 		}
 
-		spin_unlock(&pool->lock);
-
 		/* Issue the eviction callback(s) */
 		if (middle_handle) {
 			ret = pool->ops->evict(pool, middle_handle);
@@ -558,7 +672,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				goto next;
 		}
 next:
-		spin_lock(&pool->lock);
+		if (!test_bit(PAGE_HEADLESS, &page->private))
+			z3fold_page_lock(zhdr);
 		clear_bit(UNDER_RECLAIM, &page->private);
 		if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
 		    (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
@@ -567,26 +682,38 @@ next:
 			 * All buddies are now free, free the z3fold page and
 			 * return success.
 			 */
-			clear_bit(PAGE_HEADLESS, &page->private);
+			if (!test_and_clear_bit(PAGE_HEADLESS, &page->private))
+				z3fold_page_unlock(zhdr);
 			free_z3fold_page(zhdr);
-			pool->pages_nr--;
-			spin_unlock(&pool->lock);
+			atomic64_dec(&pool->pages_nr);
 			return 0;
 		}  else if (!test_bit(PAGE_HEADLESS, &page->private)) {
 			if (zhdr->first_chunks != 0 &&
 			    zhdr->last_chunks != 0 &&
 			    zhdr->middle_chunks != 0) {
 				/* Full, add to buddied list */
+				spin_lock(&pool->lock);
 				list_add(&zhdr->buddy, &pool->buddied);
+				spin_unlock(&pool->lock);
 			} else {
-				z3fold_compact_page(zhdr);
+				int compacted = z3fold_compact_page(zhdr);
 				/* add to unbuddied list */
+				spin_lock(&pool->lock);
 				freechunks = num_free_chunks(zhdr);
-				list_add(&zhdr->buddy,
-					 &pool->unbuddied[freechunks]);
+				if (compacted)
+					list_add(&zhdr->buddy,
+						&pool->unbuddied[freechunks]);
+				else
+					list_add_tail(&zhdr->buddy,
+						&pool->unbuddied[freechunks]);
+				spin_unlock(&pool->lock);
 			}
 		}
 
+		if (!test_bit(PAGE_HEADLESS, &page->private))
+			z3fold_page_unlock(zhdr);
+
+		spin_lock(&pool->lock);
 		/* add to beginning of LRU */
 		list_add(&page->lru, &pool->lru);
 	}
@@ -611,7 +738,6 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
 	void *addr;
 	enum buddy buddy;
 
-	spin_lock(&pool->lock);
 	zhdr = handle_to_z3fold_header(handle);
 	addr = zhdr;
 	page = virt_to_page(zhdr);
@@ -619,6 +745,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
 	if (test_bit(PAGE_HEADLESS, &page->private))
 		goto out;
 
+	z3fold_page_lock(zhdr);
 	buddy = handle_to_buddy(handle);
 	switch (buddy) {
 	case FIRST:
@@ -637,8 +764,9 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
 		addr = NULL;
 		break;
 	}
+
+	z3fold_page_unlock(zhdr);
 out:
-	spin_unlock(&pool->lock);
 	return addr;
 }
 
@@ -653,31 +781,28 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
 	struct page *page;
 	enum buddy buddy;
 
-	spin_lock(&pool->lock);
 	zhdr = handle_to_z3fold_header(handle);
 	page = virt_to_page(zhdr);
 
-	if (test_bit(PAGE_HEADLESS, &page->private)) {
-		spin_unlock(&pool->lock);
+	if (test_bit(PAGE_HEADLESS, &page->private))
 		return;
-	}
 
+	z3fold_page_lock(zhdr);
 	buddy = handle_to_buddy(handle);
 	if (buddy == MIDDLE)
 		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
-	spin_unlock(&pool->lock);
+	z3fold_page_unlock(zhdr);
 }
 
 /**
  * z3fold_get_pool_size() - gets the z3fold pool size in pages
  * @pool:	pool whose size is being queried
  *
- * Returns: size in pages of the given pool.  The pool lock need not be
- * taken to access pages_nr.
+ * Returns: size in pages of the given pool.
  */
 static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
 {
-	return pool->pages_nr;
+	return atomic64_read(&pool->pages_nr);
 }
 
 /*****************
@@ -776,8 +901,8 @@ MODULE_ALIAS("zpool-z3fold");
 
 static int __init init_z3fold(void)
 {
-	/* Make sure the z3fold header will fit in one chunk */
-	BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+	/* Make sure the z3fold header is not larger than the page size */
+	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
 	zpool_register_driver(&z3fold_zpool_driver);
 
 	return 0;
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index 7986f4e0da12..7aad82406422 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -14,6 +14,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/of_fdt.h>
 
 /* We need to stringify expanded macros so that they can be parsed */
 
@@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC)
 LX_VALUE(MNT_NOATIME)
 LX_VALUE(MNT_NODIRATIME)
 LX_VALUE(MNT_RELATIME)
+
+/* <linux/of_fdt.h> */
+LX_VALUE(OF_DT_HEADER)
+
+/* Kernel Configs */
+LX_CONFIG(CONFIG_OF)
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 38b1f09d1cd9..086d27223c0c 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -16,6 +16,7 @@ from linux import constants
 from linux import utils
 from linux import tasks
 from linux import lists
+from struct import *
 
 
 class LxCmdLine(gdb.Command):
@@ -195,3 +196,75 @@ values of that process namespace"""
                         info_opts(MNT_INFO, m_flags)))
 
 LxMounts()
+
+
+class LxFdtDump(gdb.Command):
+    """Output Flattened Device Tree header and dump FDT blob to the filename
+       specified as the command argument. Equivalent to
+       'cat /proc/fdt > fdtdump.dtb' on a running target"""
+
+    def __init__(self):
+        super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA,
+                                        gdb.COMPLETE_FILENAME)
+
+    def fdthdr_to_cpu(self, fdt_header):
+
+        fdt_header_be = ">IIIIIII"
+        fdt_header_le = "<IIIIIII"
+
+        if utils.get_target_endianness() == 1:
+            output_fmt = fdt_header_le
+        else:
+            output_fmt = fdt_header_be
+
+        return unpack(output_fmt, pack(fdt_header_be,
+                                       fdt_header['magic'],
+                                       fdt_header['totalsize'],
+                                       fdt_header['off_dt_struct'],
+                                       fdt_header['off_dt_strings'],
+                                       fdt_header['off_mem_rsvmap'],
+                                       fdt_header['version'],
+                                       fdt_header['last_comp_version']))
+
+    def invoke(self, arg, from_tty):
+
+        if not constants.LX_CONFIG_OF:
+            raise gdb.GdbError("Kernel not compiled with CONFIG_OF\n")
+
+        if len(arg) == 0:
+            filename = "fdtdump.dtb"
+        else:
+            filename = arg
+
+        py_fdt_header_ptr = gdb.parse_and_eval(
+            "(const struct fdt_header *) initial_boot_params")
+        py_fdt_header = py_fdt_header_ptr.dereference()
+
+        fdt_header = self.fdthdr_to_cpu(py_fdt_header)
+
+        if fdt_header[0] != constants.LX_OF_DT_HEADER:
+            raise gdb.GdbError("No flattened device tree magic found\n")
+
+        gdb.write("fdt_magic:         0x{:02X}\n".format(fdt_header[0]))
+        gdb.write("fdt_totalsize:     0x{:02X}\n".format(fdt_header[1]))
+        gdb.write("off_dt_struct:     0x{:02X}\n".format(fdt_header[2]))
+        gdb.write("off_dt_strings:    0x{:02X}\n".format(fdt_header[3]))
+        gdb.write("off_mem_rsvmap:    0x{:02X}\n".format(fdt_header[4]))
+        gdb.write("version:           {}\n".format(fdt_header[5]))
+        gdb.write("last_comp_version: {}\n".format(fdt_header[6]))
+
+        inf = gdb.inferiors()[0]
+        fdt_buf = utils.read_memoryview(inf, py_fdt_header_ptr,
+                                        fdt_header[1]).tobytes()
+
+        try:
+            f = open(filename, 'wb')
+        except:
+            raise gdb.GdbError("Could not open file to dump fdt")
+
+        f.write(fdt_buf)
+        f.close()
+
+        gdb.write("Dumped fdt blob to " + filename + "\n")
+
+LxFdtDump()
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 163c720d3f2b..ac5bf8708ff0 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -16,6 +16,7 @@ absense||absence
 absolut||absolute
 absoulte||absolute
 acccess||access
+acceess||access
 acceleratoin||acceleration
 accelleration||acceleration
 accesing||accessing
@@ -46,6 +47,7 @@ ackowledged||acknowledged
 acording||according
 activete||activate
 acumulating||accumulating
+acumulator||accumulator
 adapater||adapter
 addional||additional
 additionaly||additionally
@@ -184,6 +186,7 @@ cacluated||calculated
 caculation||calculation
 calender||calendar
 calle||called
+callibration||calibration
 calucate||calculate
 calulate||calculate
 cancelation||cancellation
@@ -244,6 +247,7 @@ compatiblity||compatibility
 competion||completion
 compilant||compliant
 compleatly||completely
+completition||completion
 completly||completely
 complient||compliant
 componnents||components
@@ -257,6 +261,7 @@ conected||connected
 configuratoin||configuration
 configuraton||configuration
 configuretion||configuration
+configutation||configuration
 conider||consider
 conjuction||conjunction
 connectinos||connections
@@ -317,6 +322,7 @@ dependant||dependent
 depreacted||deprecated
 depreacte||deprecate
 desactivate||deactivate
+desciptor||descriptor
 desciptors||descriptors
 descripton||description
 descrition||description
@@ -417,9 +423,12 @@ extention||extension
 extracter||extractor
 faild||failed
 faill||fail
+failied||failed
+faillure||failure
 failue||failure
 failuer||failure
 faireness||fairness
+falied||failed
 faliure||failure
 familar||familiar
 fatser||faster
@@ -441,6 +450,7 @@ forseeable||foreseeable
 forse||force
 fortan||fortran
 forwardig||forwarding
+framming||framing
 framwork||framework
 frequncy||frequency
 frome||from
@@ -482,6 +492,7 @@ howver||however
 hsould||should
 hypter||hyper
 identidier||identifier
+illigal||illegal
 imblance||imbalance
 immeadiately||immediately
 immedaite||immediate
@@ -520,6 +531,7 @@ informtion||information
 infromation||information
 ingore||ignore
 inital||initial
+initalized||initialized
 initalised||initialized
 initalise||initialize
 initalize||initialize
@@ -532,6 +544,7 @@ initilize||initialize
 inofficial||unofficial
 insititute||institute
 instal||install
+instanciated||instantiated
 inteface||interface
 integreated||integrated
 integrety||integrity
@@ -557,6 +570,7 @@ intialized||initialized
 intialize||initialize
 intregral||integral
 intrrupt||interrupt
+intterrupt||interrupt
 intuative||intuitive
 invaid||invalid
 invalde||invald
@@ -567,6 +581,8 @@ invokations||invocations
 irrelevent||irrelevant
 isnt||isn't
 isssue||issue
+iternations||iterations
+itertation||iteration
 itslef||itself
 jave||java
 jeffies||jiffies
@@ -621,6 +637,7 @@ messsage||message
 messsages||messages
 microprocesspr||microprocessor
 milliseonds||milliseconds
+minium||minimum
 minumum||minimum
 miscelleneous||miscellaneous
 misformed||malformed
@@ -668,6 +685,7 @@ occurances||occurrences
 occured||occurred
 occurence||occurrence
 occure||occurred
+occured||occurred
 occuring||occurring
 offet||offset
 omitt||omit
@@ -681,8 +699,10 @@ optionnal||optional
 optmizations||optimizations
 orientatied||orientated
 orientied||oriented
+orignal||original
 otherise||otherwise
 ouput||output
+oustanding||outstanding
 overaall||overall
 overhread||overhead
 overlaping||overlapping
@@ -705,6 +725,7 @@ paramter||parameter
 paramters||parameters
 particuarly||particularly
 particularily||particularly
+partiton||partition
 pased||passed
 passin||passing
 pathes||paths
@@ -724,6 +745,7 @@ pleaes||please
 ploting||plotting
 plugable||pluggable
 poinnter||pointer
+pointeur||pointer
 poiter||pointer
 posible||possible
 positon||position
@@ -752,6 +774,7 @@ procceed||proceed
 proccesors||processors
 procesed||processed
 proces||process
+procesing||processing
 processessing||processing
 processess||processes
 processpr||processor
@@ -780,6 +803,7 @@ protable||portable
 protcol||protocol
 protecion||protection
 protocoll||protocol
+promixity||proximity
 psudo||pseudo
 psuedo||pseudo
 psychadelic||psychedelic
@@ -801,6 +825,7 @@ recommanded||recommended
 recyle||recycle
 redircet||redirect
 redirectrion||redirection
+reename||rename
 refcounf||refcount
 refence||reference
 refered||referred
@@ -944,7 +969,9 @@ suble||subtle
 substract||subtract
 succesfully||successfully
 succesful||successful
+successed||succeeded
 successfull||successful
+successfuly||successfully
 sucessfully||successfully
 sucess||success
 superflous||superfluous
@@ -960,6 +987,7 @@ suppport||support
 supress||suppress
 surpresses||suppresses
 susbsystem||subsystem
+suspeneded||suspended
 suspicously||suspiciously
 swaping||swapping
 switchs||switches
@@ -967,6 +995,7 @@ symetric||symmetric
 synax||syntax
 synchonized||synchronized
 syncronize||synchronize
+syncronized||synchronized
 syncronizing||synchronizing
 syncronus||synchronous
 syste||system
@@ -1005,22 +1034,30 @@ ture||true
 tyep||type
 udpate||update
 uesd||used
+uncommited||uncommitted
 unconditionaly||unconditionally
 underun||underrun
 unecessary||unnecessary
 unexecpted||unexpected
+unexpcted||unexpected
 unexpectd||unexpected
 unexpeted||unexpected
+unexpexted||unexpected
 unfortunatelly||unfortunately
 unifiy||unify
 unintialized||uninitialized
+unkmown||unknown
 unknonw||unknown
 unknow||unknown
 unkown||unknown
 unneedingly||unnecessarily
+unnsupported||unsupported
+unmached||unmatched
 unresgister||unregister
+unrgesiter||unregister
 unsinged||unsigned
 unstabel||unstable
+unsolicitied||unsolicited
 unsuccessfull||unsuccessful
 unsuported||unsupported
 untill||until
diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
index 5487827fa86c..370eb2f4dd37 100644
--- a/security/integrity/ima/Kconfig
+++ b/security/integrity/ima/Kconfig
@@ -27,6 +27,18 @@ config IMA
 	  to learn more about IMA.
 	  If unsure, say N.
 
+config IMA_KEXEC
+	bool "Enable carrying the IMA measurement list across a soft boot"
+	depends on IMA && TCG_TPM && HAVE_IMA_KEXEC
+	default n
+	help
+	   TPM PCRs are only reset on a hard reboot.  In order to validate
+	   a TPM's quote after a soft boot, the IMA measurement list of the
+	   running kernel must be saved and restored on boot.
+
+	   Depending on the IMA policy, the measurement list can grow to
+	   be very large.
+
 config IMA_MEASURE_PCR_IDX
 	int
 	depends on IMA
diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile
index 9aeaedad1e2b..29f198bde02b 100644
--- a/security/integrity/ima/Makefile
+++ b/security/integrity/ima/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_IMA) += ima.o
 ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \
 	 ima_policy.o ima_template.o ima_template_lib.o
 ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o
+ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o
 obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index db25f54a04fe..5e6180a4da7d 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -28,6 +28,10 @@
 
 #include "../integrity.h"
 
+#ifdef CONFIG_HAVE_IMA_KEXEC
+#include <asm/ima.h>
+#endif
+
 enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN,
 		     IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII };
 enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 };
@@ -81,6 +85,7 @@ struct ima_template_field {
 
 /* IMA template descriptor definition */
 struct ima_template_desc {
+	struct list_head list;
 	char *name;
 	char *fmt;
 	int num_fields;
@@ -102,6 +107,27 @@ struct ima_queue_entry {
 };
 extern struct list_head ima_measurements;	/* list of all measurements */
 
+/* Some details preceding the binary serialized measurement list */
+struct ima_kexec_hdr {
+	u16 version;
+	u16 _reserved0;
+	u32 _reserved1;
+	u64 buffer_size;
+	u64 count;
+};
+
+#ifdef CONFIG_HAVE_IMA_KEXEC
+void ima_load_kexec_buffer(void);
+#else
+static inline void ima_load_kexec_buffer(void) {}
+#endif /* CONFIG_HAVE_IMA_KEXEC */
+
+/*
+ * The default binary_runtime_measurements list format is defined as the
+ * platform native format.  The canonical format is defined as little-endian.
+ */
+extern bool ima_canonical_fmt;
+
 /* Internal IMA function definitions */
 int ima_init(void);
 int ima_fs_init(void);
@@ -122,7 +148,12 @@ int ima_init_crypto(void);
 void ima_putc(struct seq_file *m, void *data, int datalen);
 void ima_print_digest(struct seq_file *m, u8 *digest, u32 size);
 struct ima_template_desc *ima_template_desc_current(void);
+int ima_restore_measurement_entry(struct ima_template_entry *entry);
+int ima_restore_measurement_list(loff_t bufsize, void *buf);
+int ima_measurements_show(struct seq_file *m, void *v);
+unsigned long ima_get_binary_runtime_size(void);
 int ima_init_template(void);
+void ima_init_template_list(void);
 
 /*
  * used to protect h_table and sha_table
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 38f2ed830dd6..802d5d20f36f 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -477,11 +477,13 @@ static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data,
 		u8 buffer[IMA_EVENT_NAME_LEN_MAX + 1] = { 0 };
 		u8 *data_to_hash = field_data[i].data;
 		u32 datalen = field_data[i].len;
+		u32 datalen_to_hash =
+		    !ima_canonical_fmt ? datalen : cpu_to_le32(datalen);
 
 		if (strcmp(td->name, IMA_TEMPLATE_IMA_NAME) != 0) {
 			rc = crypto_shash_update(shash,
-						(const u8 *) &field_data[i].len,
-						sizeof(field_data[i].len));
+						(const u8 *) &datalen_to_hash,
+						sizeof(datalen_to_hash));
 			if (rc)
 				break;
 		} else if (strcmp(td->fields[i]->field_id, "n") == 0) {
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 3df46906492d..ca303e5d2b94 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -28,6 +28,16 @@
 
 static DEFINE_MUTEX(ima_write_mutex);
 
+bool ima_canonical_fmt;
+static int __init default_canonical_fmt_setup(char *str)
+{
+#ifdef __BIG_ENDIAN
+	ima_canonical_fmt = 1;
+#endif
+	return 1;
+}
+__setup("ima_canonical_fmt", default_canonical_fmt_setup);
+
 static int valid_policy = 1;
 #define TMPBUFLEN 12
 static ssize_t ima_show_htable_value(char __user *buf, size_t count,
@@ -116,13 +126,13 @@ void ima_putc(struct seq_file *m, void *data, int datalen)
  *       [eventdata length]
  *       eventdata[n]=template specific data
  */
-static int ima_measurements_show(struct seq_file *m, void *v)
+int ima_measurements_show(struct seq_file *m, void *v)
 {
 	/* the list never shrinks, so we don't need a lock here */
 	struct ima_queue_entry *qe = v;
 	struct ima_template_entry *e;
 	char *template_name;
-	int namelen;
+	u32 pcr, namelen, template_data_len; /* temporary fields */
 	bool is_ima_template = false;
 	int i;
 
@@ -139,25 +149,29 @@ static int ima_measurements_show(struct seq_file *m, void *v)
 	 * PCR used defaults to the same (config option) in
 	 * little-endian format, unless set in policy
 	 */
-	ima_putc(m, &e->pcr, sizeof(e->pcr));
+	pcr = !ima_canonical_fmt ? e->pcr : cpu_to_le32(e->pcr);
+	ima_putc(m, &pcr, sizeof(e->pcr));
 
 	/* 2nd: template digest */
 	ima_putc(m, e->digest, TPM_DIGEST_SIZE);
 
 	/* 3rd: template name size */
-	namelen = strlen(template_name);
+	namelen = !ima_canonical_fmt ? strlen(template_name) :
+		cpu_to_le32(strlen(template_name));
 	ima_putc(m, &namelen, sizeof(namelen));
 
 	/* 4th:  template name */
-	ima_putc(m, template_name, namelen);
+	ima_putc(m, template_name, strlen(template_name));
 
 	/* 5th:  template length (except for 'ima' template) */
 	if (strcmp(template_name, IMA_TEMPLATE_IMA_NAME) == 0)
 		is_ima_template = true;
 
-	if (!is_ima_template)
-		ima_putc(m, &e->template_data_len,
-			 sizeof(e->template_data_len));
+	if (!is_ima_template) {
+		template_data_len = !ima_canonical_fmt ? e->template_data_len :
+			cpu_to_le32(e->template_data_len);
+		ima_putc(m, &template_data_len, sizeof(e->template_data_len));
+	}
 
 	/* 6th:  template specific data */
 	for (i = 0; i < e->template_desc->num_fields; i++) {
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 2ac1f41db5c0..2967d497a665 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -129,6 +129,8 @@ int __init ima_init(void)
 	if (rc != 0)
 		return rc;
 
+	ima_load_kexec_buffer();
+
 	rc = ima_add_boot_aggregate();	/* boot aggregate must be first entry */
 	if (rc != 0)
 		return rc;
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
new file mode 100644
index 000000000000..e473eee913cb
--- /dev/null
+++ b/security/integrity/ima/ima_kexec.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ * Mimi Zohar <zohar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec.h>
+#include "ima.h"
+
+#ifdef CONFIG_IMA_KEXEC
+static int ima_dump_measurement_list(unsigned long *buffer_size, void **buffer,
+				     unsigned long segment_size)
+{
+	struct ima_queue_entry *qe;
+	struct seq_file file;
+	struct ima_kexec_hdr khdr;
+	int ret = 0;
+
+	/* segment size can't change between kexec load and execute */
+	file.buf = vmalloc(segment_size);
+	if (!file.buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	file.size = segment_size;
+	file.read_pos = 0;
+	file.count = sizeof(khdr);	/* reserved space */
+
+	memset(&khdr, 0, sizeof(khdr));
+	khdr.version = 1;
+	list_for_each_entry_rcu(qe, &ima_measurements, later) {
+		if (file.count < file.size) {
+			khdr.count++;
+			ima_measurements_show(&file, qe);
+		} else {
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * fill in reserved space with some buffer details
+	 * (eg. version, buffer size, number of measurements)
+	 */
+	khdr.buffer_size = file.count;
+	if (ima_canonical_fmt) {
+		khdr.version = cpu_to_le16(khdr.version);
+		khdr.count = cpu_to_le64(khdr.count);
+		khdr.buffer_size = cpu_to_le64(khdr.buffer_size);
+	}
+	memcpy(file.buf, &khdr, sizeof(khdr));
+
+	print_hex_dump(KERN_DEBUG, "ima dump: ", DUMP_PREFIX_NONE,
+			16, 1, file.buf,
+			file.count < 100 ? file.count : 100, true);
+
+	*buffer_size = file.count;
+	*buffer = file.buf;
+out:
+	if (ret == -EINVAL)
+		vfree(file.buf);
+	return ret;
+}
+
+/*
+ * Called during kexec_file_load so that IMA can add a segment to the kexec
+ * image for the measurement list for the next kernel.
+ *
+ * This function assumes that kexec_mutex is held.
+ */
+void ima_add_kexec_buffer(struct kimage *image)
+{
+	struct kexec_buf kbuf = { .image = image, .buf_align = PAGE_SIZE,
+				  .buf_min = 0, .buf_max = ULONG_MAX,
+				  .top_down = true };
+	unsigned long binary_runtime_size;
+
+	/* use more understandable variable names than defined in kbuf */
+	void *kexec_buffer = NULL;
+	size_t kexec_buffer_size;
+	size_t kexec_segment_size;
+	int ret;
+
+	/*
+	 * Reserve an extra half page of memory for additional measurements
+	 * added during the kexec load.
+	 */
+	binary_runtime_size = ima_get_binary_runtime_size();
+	if (binary_runtime_size >= ULONG_MAX - PAGE_SIZE)
+		kexec_segment_size = ULONG_MAX;
+	else
+		kexec_segment_size = ALIGN(ima_get_binary_runtime_size() +
+					   PAGE_SIZE / 2, PAGE_SIZE);
+	if ((kexec_segment_size == ULONG_MAX) ||
+	    ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) {
+		pr_err("Binary measurement list too large.\n");
+		return;
+	}
+
+	ima_dump_measurement_list(&kexec_buffer_size, &kexec_buffer,
+				  kexec_segment_size);
+	if (!kexec_buffer) {
+		pr_err("Not enough memory for the kexec measurement buffer.\n");
+		return;
+	}
+
+	kbuf.buffer = kexec_buffer;
+	kbuf.bufsz = kexec_buffer_size;
+	kbuf.memsz = kexec_segment_size;
+	ret = kexec_add_buffer(&kbuf);
+	if (ret) {
+		pr_err("Error passing over kexec measurement buffer.\n");
+		return;
+	}
+
+	ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size);
+	if (ret) {
+		pr_err("Error passing over kexec measurement buffer.\n");
+		return;
+	}
+
+	pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
+		 kbuf.mem);
+}
+#endif /* CONFIG_IMA_KEXEC */
+
+/*
+ * Restore the measurement list from the previous kernel.
+ */
+void ima_load_kexec_buffer(void)
+{
+	void *kexec_buffer = NULL;
+	size_t kexec_buffer_size = 0;
+	int rc;
+
+	rc = ima_get_kexec_buffer(&kexec_buffer, &kexec_buffer_size);
+	switch (rc) {
+	case 0:
+		rc = ima_restore_measurement_list(kexec_buffer_size,
+						  kexec_buffer);
+		if (rc != 0)
+			pr_err("Failed to restore the measurement list: %d\n",
+				rc);
+
+		ima_free_kexec_buffer();
+		break;
+	case -ENOTSUPP:
+		pr_debug("Restoring the measurement list not supported\n");
+		break;
+	case -ENOENT:
+		pr_debug("No measurement list to restore\n");
+		break;
+	default:
+		pr_debug("Error restoring the measurement list: %d\n", rc);
+	}
+}
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 423d111b3b94..50818c60538b 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -418,6 +418,7 @@ static int __init init_ima(void)
 {
 	int error;
 
+	ima_init_template_list();
 	hash_setup(CONFIG_IMA_DEFAULT_HASH);
 	error = ima_init();
 	if (!error) {
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index 32f6ac0f96df..d9aa5ab71204 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -29,6 +29,11 @@
 #define AUDIT_CAUSE_LEN_MAX 32
 
 LIST_HEAD(ima_measurements);	/* list of all measurements */
+#ifdef CONFIG_IMA_KEXEC
+static unsigned long binary_runtime_size;
+#else
+static unsigned long binary_runtime_size = ULONG_MAX;
+#endif
 
 /* key: inode (before secure-hashing a file) */
 struct ima_h_table ima_htable = {
@@ -64,12 +69,32 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value,
 	return ret;
 }
 
+/*
+ * Return the number of bytes a single measurement list entry takes up
+ * in the serialized binary_runtime_measurement format.  The template
+ * name and the template data are variable length, so the result differs
+ * from entry to entry.
+ */
+static int get_binary_runtime_size(struct ima_template_entry *entry)
+{
+	return sizeof(u32)				/* pcr */
+	       + sizeof(entry->digest)
+	       + sizeof(int)				/* template name size field */
+	       + strlen(entry->template_desc->name) + 1
+	       + sizeof(entry->template_data_len)
+	       + entry->template_data_len;
+}
+
+
 /* ima_add_template_entry helper function:
- * - Add template entry to measurement list and hash table.
+ * - Add template entry to the measurement list and hash table, for
+ *   all entries except those carried across kexec.
  *
  * (Called with ima_extend_list_mutex held.)
  */
-static int ima_add_digest_entry(struct ima_template_entry *entry)
+static int ima_add_digest_entry(struct ima_template_entry *entry,
+				bool update_htable)
 {
 	struct ima_queue_entry *qe;
 	unsigned int key;
@@ -85,11 +110,34 @@ static int ima_add_digest_entry(struct ima_template_entry *entry)
 	list_add_tail_rcu(&qe->later, &ima_measurements);
 
 	atomic_long_inc(&ima_htable.len);
-	key = ima_hash_key(entry->digest);
-	hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]);
+	if (update_htable) {
+		key = ima_hash_key(entry->digest);
+		hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]);
+	}
+
+	if (binary_runtime_size != ULONG_MAX) {
+		int size;
+
+		size = get_binary_runtime_size(entry);
+		binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ?
+		     binary_runtime_size + size : ULONG_MAX;
+	}
 	return 0;
 }
 
+/*
+ * Return the amount of memory required for serializing the
+ * entire binary_runtime_measurement list, including the ima_kexec_hdr
+ * structure.
+ *
+ * The result saturates at ULONG_MAX instead of wrapping around.
+ * binary_runtime_size itself is ULONG_MAX when CONFIG_IMA_KEXEC is not
+ * set, so ULONG_MAX is returned in that case as well.
+ */
+unsigned long ima_get_binary_runtime_size(void)
+{
+	if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr)))
+		return ULONG_MAX;
+
+	return binary_runtime_size + sizeof(struct ima_kexec_hdr);
+}
+
 static int ima_pcr_extend(const u8 *hash, int pcr)
 {
 	int result = 0;
@@ -103,8 +151,13 @@ static int ima_pcr_extend(const u8 *hash, int pcr)
 	return result;
 }
 
-/* Add template entry to the measurement list and hash table,
- * and extend the pcr.
+/*
+ * Add template entry to the measurement list and hash table, and
+ * extend the pcr.
+ *
+ * On systems which support carrying the IMA measurement list across
+ * kexec, maintain the total memory size required for serializing the
+ * binary_runtime_measurements.
  */
 int ima_add_template_entry(struct ima_template_entry *entry, int violation,
 			   const char *op, struct inode *inode,
@@ -126,7 +179,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation,
 		}
 	}
 
-	result = ima_add_digest_entry(entry);
+	result = ima_add_digest_entry(entry, 1);
 	if (result < 0) {
 		audit_cause = "ENOMEM";
 		audit_info = 0;
@@ -149,3 +202,13 @@ out:
 			    op, audit_cause, result, audit_info);
 	return result;
 }
+
+/*
+ * Append an entry restored from a serialized measurement list to the
+ * measurement list.  The entry is queued under ima_extend_list_mutex
+ * and is not added to the hash table.
+ */
+int ima_restore_measurement_entry(struct ima_template_entry *entry)
+{
+	int rc;
+
+	mutex_lock(&ima_extend_list_mutex);
+	rc = ima_add_digest_entry(entry, false);
+	mutex_unlock(&ima_extend_list_mutex);
+
+	return rc;
+}
diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c
index febd12ed9b55..cebb37c63629 100644
--- a/security/integrity/ima/ima_template.c
+++ b/security/integrity/ima/ima_template.c
@@ -15,16 +15,20 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/rculist.h>
 #include "ima.h"
 #include "ima_template_lib.h"
 
-static struct ima_template_desc defined_templates[] = {
+static struct ima_template_desc builtin_templates[] = {
 	{.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT},
 	{.name = "ima-ng", .fmt = "d-ng|n-ng"},
 	{.name = "ima-sig", .fmt = "d-ng|n-ng|sig"},
 	{.name = "", .fmt = ""},	/* placeholder for a custom format */
 };
 
+static LIST_HEAD(defined_templates);
+static DEFINE_SPINLOCK(template_list);
+
 static struct ima_template_field supported_fields[] = {
 	{.field_id = "d", .field_init = ima_eventdigest_init,
 	 .field_show = ima_show_template_digest},
@@ -37,6 +41,7 @@ static struct ima_template_field supported_fields[] = {
 	{.field_id = "sig", .field_init = ima_eventsig_init,
 	 .field_show = ima_show_template_sig},
 };
+#define MAX_TEMPLATE_NAME_LEN 15
 
 static struct ima_template_desc *ima_template;
 static struct ima_template_desc *lookup_template_desc(const char *name);
@@ -52,6 +57,8 @@ static int __init ima_template_setup(char *str)
 	if (ima_template)
 		return 1;
 
+	ima_init_template_list();
+
 	/*
 	 * Verify that a template with the supplied name exists.
 	 * If not, use CONFIG_IMA_DEFAULT_TEMPLATE.
@@ -80,7 +87,7 @@ __setup("ima_template=", ima_template_setup);
 
 static int __init ima_template_fmt_setup(char *str)
 {
-	int num_templates = ARRAY_SIZE(defined_templates);
+	int num_templates = ARRAY_SIZE(builtin_templates);
 
 	if (ima_template)
 		return 1;
@@ -91,22 +98,28 @@ static int __init ima_template_fmt_setup(char *str)
 		return 1;
 	}
 
-	defined_templates[num_templates - 1].fmt = str;
-	ima_template = defined_templates + num_templates - 1;
+	builtin_templates[num_templates - 1].fmt = str;
+	ima_template = builtin_templates + num_templates - 1;
+
 	return 1;
 }
 __setup("ima_template_fmt=", ima_template_fmt_setup);
 
 static struct ima_template_desc *lookup_template_desc(const char *name)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(defined_templates); i++) {
-		if (strcmp(defined_templates[i].name, name) == 0)
-			return defined_templates + i;
+	struct ima_template_desc *template_desc;
+	int found = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(template_desc, &defined_templates, list) {
+		if ((strcmp(template_desc->name, name) == 0) ||
+		    (strcmp(template_desc->fmt, name) == 0)) {
+			found = 1;
+			break;
+		}
 	}
-
-	return NULL;
+	rcu_read_unlock();
+	return found ? template_desc : NULL;
 }
 
 static struct ima_template_field *lookup_template_field(const char *field_id)
@@ -142,9 +155,14 @@ static int template_desc_init_fields(const char *template_fmt,
 {
 	const char *template_fmt_ptr;
 	struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX];
-	int template_num_fields = template_fmt_size(template_fmt);
+	int template_num_fields;
 	int i, len;
 
+	if (num_fields && *num_fields > 0) /* already initialized? */
+		return 0;
+
+	template_num_fields = template_fmt_size(template_fmt);
+
 	if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) {
 		pr_err("format string '%s' contains too many fields\n",
 		       template_fmt);
@@ -182,11 +200,28 @@ static int template_desc_init_fields(const char *template_fmt,
 	return 0;
 }
 
+/*
+ * Put every builtin template descriptor on defined_templates.  Returns
+ * without doing anything if the list already has entries, so calling it
+ * more than once is harmless.
+ */
+void ima_init_template_list(void)
+{
+	struct ima_template_desc *desc = builtin_templates;
+	struct ima_template_desc *end = desc + ARRAY_SIZE(builtin_templates);
+
+	if (!list_empty(&defined_templates))
+		return;
+
+	spin_lock(&template_list);
+	for (; desc < end; desc++)
+		list_add_tail_rcu(&desc->list, &defined_templates);
+	spin_unlock(&template_list);
+}
+
 struct ima_template_desc *ima_template_desc_current(void)
 {
-	if (!ima_template)
+	if (!ima_template) {
+		ima_init_template_list();	/* the lookup walks this list */
 		ima_template =
 		    lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE);
+	}
 	return ima_template;
 }
 
@@ -205,3 +240,239 @@ int __init ima_init_template(void)
 
 	return result;
 }
+
+/*
+ * Create a template descriptor for the format string @template_name and
+ * append it to defined_templates.
+ *
+ * The format string is first validated with template_desc_init_fields()
+ * and then copied; the descriptor's name is left empty.
+ *
+ * Returns the new descriptor, or NULL if the format string is rejected
+ * or memory could not be allocated.
+ */
+static struct ima_template_desc *restore_template_fmt(char *template_name)
+{
+	struct ima_template_desc *template_desc = NULL;
+	int ret;
+
+	ret = template_desc_init_fields(template_name, NULL, NULL);
+	if (ret < 0) {
+		pr_err("attempting to initialize the template \"%s\" failed\n",
+			template_name);
+		goto out;
+	}
+
+	template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL);
+	if (!template_desc)
+		goto out;
+
+	template_desc->name = "";
+	template_desc->fmt = kstrdup(template_name, GFP_KERNEL);
+	if (!template_desc->fmt) {
+		/*
+		 * Never hand back a descriptor without a format string: it
+		 * is on no list, so nothing would ever free it, and the
+		 * caller would go on to use its NULL fmt.
+		 */
+		kfree(template_desc);
+		template_desc = NULL;
+		goto out;
+	}
+
+	spin_lock(&template_list);
+	list_add_tail_rcu(&template_desc->list, &defined_templates);
+	spin_unlock(&template_list);
+out:
+	return template_desc;
+}
+
+/*
+ * Rebuild one template entry from its serialized template data.
+ *
+ * @template_desc:	descriptor of the entry's template; num_fields
+ *			fields are read from @template_data
+ * @template_data:	serialized fields, each one a u32 length followed
+ *			by that many bytes of data
+ * @template_data_size:	number of bytes available at @template_data
+ * @entry:		receives the newly allocated entry on success and
+ *			NULL on failure
+ *
+ * The field length is little endian when ima_canonical_fmt is set and
+ * host endian otherwise.  @template_data is only read, never modified.
+ *
+ * Returns 0 on success, -EINVAL if a field does not fit into
+ * @template_data_size, -ENOMEM if an allocation fails.
+ */
+static int ima_restore_template_data(struct ima_template_desc *template_desc,
+				     void *template_data,
+				     int template_data_size,
+				     struct ima_template_entry **entry)
+{
+	struct binary_field_data {
+		u32 len;
+		u8 data[0];
+	} __packed;
+
+	struct binary_field_data *field_data;
+	u32 field_len;
+	int offset = 0;
+	int ret = 0;
+	int i;
+
+	*entry = NULL;
+	if (template_data_size < 0)
+		return -EINVAL;
+
+	*entry = kzalloc(sizeof(**entry) +
+		    template_desc->num_fields * sizeof(struct ima_field_data),
+		    GFP_NOFS);
+	if (!*entry)
+		return -ENOMEM;
+
+	(*entry)->template_desc = template_desc;
+	for (i = 0; i < template_desc->num_fields; i++) {
+		field_data = template_data + offset;
+
+		/*
+		 * Each field of the template data is prefixed with a length.
+		 *
+		 * Compare against the bytes that are left instead of
+		 * subtracting from the total size: that subtraction is done
+		 * in unsigned arithmetic and wraps around as soon as the
+		 * operand is larger than the buffer, which lets an
+		 * oversized field pass the check.  offset never exceeds
+		 * template_data_size, so the difference is never negative.
+		 */
+		if (template_data_size - offset < (int)sizeof(*field_data)) {
+			pr_err("Restoring the template field failed\n");
+			ret = -EINVAL;
+			break;
+		}
+		offset += sizeof(*field_data);
+
+		field_len = field_data->len;
+		if (ima_canonical_fmt)
+			field_len = le32_to_cpu(field_len);
+
+		if (field_len > (u32)(template_data_size - offset)) {
+			pr_err("Restoring the template field data failed\n");
+			ret = -EINVAL;
+			break;
+		}
+		offset += field_len;
+
+		(*entry)->template_data[i].len = field_len;
+		(*entry)->template_data_len += sizeof(field_data->len);
+
+		/* The extra zeroed byte leaves the copy NUL-terminated. */
+		(*entry)->template_data[i].data =
+			kzalloc(field_len + 1, GFP_KERNEL);
+		if (!(*entry)->template_data[i].data) {
+			ret = -ENOMEM;
+			break;
+		}
+		memcpy((*entry)->template_data[i].data, field_data->data,
+			field_len);
+		(*entry)->template_data_len += field_len;
+	}
+
+	if (ret < 0) {
+		ima_free_template_entry(*entry);
+		*entry = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Restore the serialized binary measurement list without extending PCRs.
+ *
+ * @size: number of bytes available at @buf
+ * @buf:  buffer holding a struct ima_kexec_hdr followed by the
+ *	  serialized measurement entries
+ *
+ * The header and the length fields are little endian when
+ * ima_canonical_fmt is set and host endian otherwise.  Every entry is
+ * checked against the end of the buffer before it is read, turned into
+ * an ima_template_entry and queued with ima_restore_measurement_entry().
+ *
+ * Returns 0 on success, and also when @buf is NULL or too small to hold
+ * the header; a negative errno otherwise.  Entries that were restored
+ * before a failure stay on the measurement list.
+ */
+int ima_restore_measurement_list(loff_t size, void *buf)
+{
+	struct binary_hdr_v1 {
+		u32 pcr;
+		u8 digest[TPM_DIGEST_SIZE];
+		u32 template_name_len;
+		char template_name[0];
+	} __packed;
+	char template_name[MAX_TEMPLATE_NAME_LEN];
+
+	struct binary_data_v1 {
+		u32 template_data_size;
+		char template_data[0];
+	} __packed;
+
+	struct ima_kexec_hdr *khdr = buf;
+	struct binary_hdr_v1 *hdr_v1;
+	struct binary_data_v1 *data_v1;
+
+	void *bufp = buf + sizeof(*khdr);
+	void *bufendp;
+	struct ima_template_entry *entry;
+	struct ima_template_desc *template_desc;
+	unsigned long count = 0;
+	u32 name_len;
+	u32 data_size;
+	int ret = 0;
+
+	if (!buf || size < sizeof(*khdr))
+		return 0;
+
+	if (ima_canonical_fmt) {
+		khdr->version = le16_to_cpu(khdr->version);
+		khdr->count = le64_to_cpu(khdr->count);
+		khdr->buffer_size = le64_to_cpu(khdr->buffer_size);
+	}
+
+	if (khdr->version != 1) {
+		pr_err("attempting to restore a incompatible measurement list\n");
+		return -EINVAL;
+	}
+
+	if (khdr->count > ULONG_MAX - 1) {
+		pr_err("attempting to restore too many measurements\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * buffer_size is read from the buffer itself and every bounds
+	 * check below is made against it, so it must not claim more bytes
+	 * than the caller says are available.
+	 */
+	if (khdr->buffer_size > (u64)size) {
+		pr_err("attempting to restore a measurement list larger than its buffer\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * ima kexec buffer prefix: version, buffer size, count
+	 * v1 format: pcr, digest, template-name-len, template-name,
+	 *	      template-data-size, template-data
+	 *
+	 * bufp never moves past bufendp, so "bufendp - bufp" is the number
+	 * of bytes left and is never negative.
+	 */
+	bufendp = buf + khdr->buffer_size;
+	while ((bufp < bufendp) && (count++ < khdr->count)) {
+		hdr_v1 = bufp;
+		if (bufendp - bufp < sizeof(*hdr_v1)) {
+			pr_err("attempting to restore partial measurement\n");
+			ret = -EINVAL;
+			break;
+		}
+		bufp += sizeof(*hdr_v1);
+
+		name_len = hdr_v1->template_name_len;
+		if (ima_canonical_fmt)
+			name_len = le32_to_cpu(name_len);
+
+		if ((name_len >= MAX_TEMPLATE_NAME_LEN) ||
+		    (bufendp - bufp < name_len)) {
+			pr_err("attempting to restore a template name that is too long\n");
+			ret = -EINVAL;
+			break;
+		}
+		bufp += name_len;
+		data_v1 = bufp;
+
+		/* template name is not null terminated */
+		memcpy(template_name, hdr_v1->template_name, name_len);
+		template_name[name_len] = 0;
+
+		if (strcmp(template_name, "ima") == 0) {
+			pr_err("attempting to restore an unsupported template \"%s\" failed\n",
+			       template_name);
+			ret = -EINVAL;
+			break;
+		}
+
+		template_desc = lookup_template_desc(template_name);
+		if (!template_desc) {
+			template_desc = restore_template_fmt(template_name);
+			if (!template_desc) {
+				/* A truncated restore is not a success. */
+				ret = -EINVAL;
+				break;
+			}
+		}
+
+		/*
+		 * Only the running system's template format is initialized
+		 * on boot.  As needed, initialize the other template formats.
+		 */
+		ret = template_desc_init_fields(template_desc->fmt,
+						&(template_desc->fields),
+						&(template_desc->num_fields));
+		if (ret < 0) {
+			pr_err("attempting to restore the template fmt \"%s\" failed\n",
+			       template_desc->fmt);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (bufendp - bufp < sizeof(data_v1->template_data_size)) {
+			pr_err("restoring the template data size failed\n");
+			ret = -EINVAL;
+			break;
+		}
+		bufp += sizeof(data_v1->template_data_size);
+
+		data_size = data_v1->template_data_size;
+		if (ima_canonical_fmt)
+			data_size = le32_to_cpu(data_size);
+
+		/* The size is handed on as an int, so it has to fit in one. */
+		if ((data_size > INT_MAX) || (bufendp - bufp < data_size)) {
+			pr_err("restoring the template data failed\n");
+			ret = -EINVAL;
+			break;
+		}
+		bufp += data_size;
+
+		ret = ima_restore_template_data(template_desc,
+						data_v1->template_data,
+						data_size, &entry);
+		if (ret < 0)
+			break;
+
+		memcpy(entry->digest, hdr_v1->digest, TPM_DIGEST_SIZE);
+		entry->pcr =
+		    !ima_canonical_fmt ? hdr_v1->pcr : le32_to_cpu(hdr_v1->pcr);
+		ret = ima_restore_measurement_entry(entry);
+		if (ret < 0)
+			break;
+
+	}
+	return ret;
+}
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index f9bae04ba176..f9ba37b3928d 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -103,8 +103,11 @@ static void ima_show_template_data_binary(struct seq_file *m,
 	u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ?
 	    strlen(field_data->data) : field_data->len;
 
-	if (show != IMA_SHOW_BINARY_NO_FIELD_LEN)
-		ima_putc(m, &len, sizeof(len));
+	if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) {
+		u32 field_len = !ima_canonical_fmt ? len : cpu_to_le32(len);
+
+		ima_putc(m, &field_len, sizeof(field_len));
+	}
 
 	if (!len)
 		return;
author	Stephen Rothwell <sfr@canb.auug.org.au>	2016-12-20 11:54:45 +1100
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2016-12-20 11:54:45 +1100
commit	09fc92cf819f627c5c6c2fe4b8921950d9f81275 (patch)
tree	9d43e3644f926bc169ab2a46945d6d61a8263247
parent	b8317fd612ceb1d118275e92f66ded6fdf0a2023 (diff)
parent	e1f8c36b6eb3b4b793064ecd4f440c02e7fdbe00 (diff)