summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2009-06-10 16:43:04 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2009-06-10 16:43:04 +1000
commite7eb44d1b11c11de19daf4a2a032494d1a9bd27c (patch)
tree2f5f2dbe5288471e247c6700485d2351530a5d2a
parentdd0cef1a1bee05543b5821c4664e353369f252de (diff)
parent8f859e1d843b10de912af02dc9cc54742c43f3e8 (diff)
Merge commit 'kmemcheck/auto-kmemcheck-next'
Conflicts: arch/x86/mm/fault.c include/linux/ring_buffer.h include/linux/slab.h kernel/trace/ring_buffer.c mm/Makefile mm/slab.c mm/slub.c
-rw-r--r--Documentation/kmemcheck.txt129
-rw-r--r--MAINTAINERS8
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Kconfig.debug1
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/include/asm/dma-mapping.h8
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/page_types.h8
-rw-r--r--arch/x86/include/asm/pgtable.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h9
-rw-r--r--arch/x86/include/asm/string_32.h8
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c23
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/setup.c1
-rw-r--r--arch/x86/kernel/stacktrace.c7
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c18
-rw-r--r--arch/x86/mm/init.c15
-rw-r--r--arch/x86/mm/init_32.c30
-rw-r--r--arch/x86/mm/init_64.c37
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c637
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pgtable.c12
-rw-r--r--crypto/xor.c7
-rw-r--r--drivers/ieee1394/csr1212.c2
-rw-r--r--drivers/ieee1394/nodemgr.c5
-rw-r--r--drivers/misc/c2port/core.c2
-rw-r--r--include/linux/c2port.h3
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/linux/gfp.h14
-rw-r--r--include/linux/interrupt.h14
-rw-r--r--include/linux/kmemcheck.h157
-rw-r--r--include/linux/mm_types.h8
-rw-r--r--include/linux/ring_buffer.h4
-rw-r--r--include/linux/skbuff.h7
-rw-r--r--include/linux/slab.h7
-rw-r--r--include/linux/slab_def.h81
-rw-r--r--include/linux/stacktrace.h3
-rw-r--r--include/net/inet_sock.h14
-rw-r--r--include/net/inet_timewait_sock.h5
-rw-r--r--include/net/sock.h2
-rw-r--r--init/do_mounts.c3
-rw-r--r--init/main.c4
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/softirq.c11
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/trace/ring_buffer.c3
-rw-r--r--lib/Kconfig.debug6
-rw-r--r--lib/Kconfig.kmemcheck91
-rw-r--r--mm/Makefile1
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/page_alloc.c18
-rw-r--r--mm/slab.c108
-rw-r--r--mm/slub.c37
-rw-r--r--net/core/skbuff.c8
-rw-r--r--net/core/sock.c2
-rw-r--r--net/ipv4/inet_timewait_sock.c3
73 files changed, 2313 insertions, 156 deletions
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 000000000000..a848d499811a
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,129 @@
+Contents
+========
+
+ 1. How to use
+ 2. Technical description
+ 3. Changes to the slab allocators
+ 4. Problems
+ 5. Parameters
+ 6. Future enhancements
+
+
+How to use (IMPORTANT)
+======================
+
+Always remember this: kmemcheck _will_ give false positives. So don't enable
+it and spam the mailing list with its reports; you are not going to be heard,
+and it will make people's skins thicker for when the real errors are found.
+
+Instead, I encourage maintainers and developers to find errors in _their_
+_own_ code. And if you find false positives, you can try to work around them,
+try to figure out if it's a real bug or not, or simply ignore them. Most
+developers know their own code and will quickly and efficiently determine the
+root cause of a kmemcheck report. This is therefore also the most efficient
+way to work with kmemcheck.
+
+If you still want to run kmemcheck to inspect others' code, the rule of thumb
+should be: If it's not obvious (to you), don't tell us about it either. Most
+likely the code is correct and you'll only waste our time. If you can work
+out the error, please do send the maintainer a heads up and/or a patch, but
+don't expect him/her to fix something that wasn't wrong in the first place.
+
+
+Technical description
+=====================
+
+kmemcheck works by marking memory pages non-present. This means that whenever
+somebody attempts to access the page, a page fault is generated. The page
+fault handler notices that the page was in fact only hidden, and so it calls
+on the kmemcheck code to make further investigations.
+
+When the investigations are completed, kmemcheck "shows" the page by marking
+it present (as it would be under normal circumstances). This way, the
+interrupted code can continue as usual.
+
+But after the instruction has been executed, we should hide the page again, so
+that we can catch the next access too! Now kmemcheck makes use of a debugging
+feature of the processor, namely single-stepping. When the processor has
+finished the one instruction that generated the memory access, a debug
+exception is raised. From here, we simply hide the page again and continue
+execution, this time with the single-stepping feature turned off.
+
+
+Changes to the slab allocators
+==============================
+
+kmemcheck requires some assistance from the memory allocator in order to work.
+The memory allocator needs to
+
+1. Tell kmemcheck about newly allocated pages and pages that are about to
+ be freed. This allows kmemcheck to set up and tear down the shadow memory
+ for the pages in question. The shadow memory stores the status of each byte
+ in the allocation proper, e.g. whether it is initialized or uninitialized.
+2. Tell kmemcheck which parts of memory should be marked uninitialized. There
+ are actually a few more states, such as "not yet allocated" and "recently
+ freed".
+
+If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
+memory that can take page faults because of kmemcheck.
+
+If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
+request memory with the __GFP_NOTRACK flag. This does not prevent the page
+faults from occurring, however, but marks the object in question as being
+initialized so that no warnings will ever be produced for this object.
+
+Currently, the SLAB and SLUB allocators are supported by kmemcheck.
+
+
+Problems
+========
+
+The most prominent problem seems to be that of bit-fields. kmemcheck can only
+track memory with byte granularity. Therefore, when gcc generates code to
+access only one bit in a bit-field, there is really no way for kmemcheck to
+know which of the other bits will be used or thrown away. Consequently, there
+may be bogus warnings for bit-field accesses. We have added a "bitfields" API
+to get around this problem. See include/linux/kmemcheck.h for detailed
+instructions!
+
+
+Parameters
+==========
+
+In addition to enabling CONFIG_KMEMCHECK before the kernel is compiled, the
+parameter kmemcheck=1 must be passed to the kernel when it is started in order
+to actually do the tracking. So by default, there is only a very small
+(probably negligible) overhead for enabling the config option.
+
+Similarly, kmemcheck may be turned on or off at run-time using, respectively:
+
+echo 1 > /proc/sys/kernel/kmemcheck
+ and
+echo 0 > /proc/sys/kernel/kmemcheck
+
+Note that this is a lazy setting; once turned off, the old allocations will
+still have to take a single page fault exception before tracking is turned off
+for that particular page. Enabling kmemcheck on will only enable tracking for
+allocations made from that point onwards.
+
+The default mode is the one-shot mode, where only the first error is reported
+before kmemcheck is disabled. This mode can be enabled by passing kmemcheck=2
+to the kernel at boot, or running
+
+echo 2 > /proc/sys/kernel/kmemcheck
+
+when the kernel is already running.
+
+
+Future enhancements
+===================
+
+There is already some preliminary support for catching use-after-free errors.
+What still needs to be done is delaying kfree() so that memory is not
+reallocated immediately after freeing it. [Suggested by Pekka Enberg.]
+
+It should be possible to allow SMP systems by duplicating the page tables for
+each processor in the system. This is probably extremely difficult, however.
+[Suggested by Ingo Molnar.]
+
+Support for instruction set extensions like XMM, SSE2, etc.
diff --git a/MAINTAINERS b/MAINTAINERS
index b7b395f60a0d..ffef93e0097a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3391,6 +3391,14 @@ F: drivers/serial/kgdboc.c
F: include/linux/kgdb.h
F: kernel/kgdb.c
+KMEMCHECK
+P: Vegard Nossum
+M: vegardno@ifi.uio.no
+P Pekka Enberg
+M: penberg@cs.helsinki.fi
+L: linux-kernel@vger.kernel.org
+S: Maintained
+
KMEMTRACE
P: Eduard - Gabriel Munteanu
M: eduard.munteanu@linux360.ro
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2d2cdbd99571..6e645bb76bcc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_ARCH_KMEMCHECK
config OUTPUT_FORMAT
string
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..d8e7a8c848a8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,6 +66,7 @@ config DEBUG_STACKOVERFLOW
config DEBUG_STACK_USAGE
bool "Stack utilization instrumentation"
depends on DEBUG_KERNEL
+ depends on !KMEMCHECK
---help---
Enables the display of the minimum amount of free stack which each
task has ever had available in the sysrq-T and sysrq-P debug output.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index edbd0ca62067..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -81,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
endif
endif
+# Don't unroll struct assignments with kmemcheck enabled
+ifeq ($(CONFIG_KMEMCHECK),y)
+ KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
+endif
+
# Stackpointer is addressed different for 32 bit and 64 bit x86
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..916cbb6c3c0e 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
* Documentation/DMA-API.txt for documentation.
*/
+#include <linux/kmemcheck.h>
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
#include <linux/dma-attrs.h>
@@ -59,6 +60,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
struct dma_map_ops *ops = get_dma_ops(hwdev);
dma_addr_t addr;
+ kmemcheck_mark_initialized(ptr, size);
BUG_ON(!valid_dma_direction(dir));
addr = ops->map_page(hwdev, virt_to_page(ptr),
(unsigned long)ptr & ~PAGE_MASK, size,
@@ -88,6 +90,11 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
struct dma_map_ops *ops = get_dma_ops(hwdev);
int ents;
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i)
+ kmemcheck_mark_initialized(sg_virt(s), s->length);
BUG_ON(!valid_dma_direction(dir));
ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
debug_dma_map_sg(hwdev, sg, nents, ents, dir);
@@ -199,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
struct dma_map_ops *ops = get_dma_ops(dev);
dma_addr_t addr;
+ kmemcheck_mark_initialized(page_address(page) + offset, size);
BUG_ON(!valid_dma_direction(dir));
addr = ops->map_page(dev, page, offset, size, dir, NULL);
debug_dma_map_page(dev, page, offset, size, dir, addr, false);
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
+#ifndef ASM_X86_KMEMCHECK_H
+#define ASM_X86_KMEMCHECK_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_KMEMCHECK
+bool kmemcheck_active(struct pt_regs *regs);
+
+void kmemcheck_show(struct pt_regs *regs);
+void kmemcheck_hide(struct pt_regs *regs);
+
+bool kmemcheck_fault(struct pt_regs *regs,
+ unsigned long address, unsigned long error_code);
+bool kmemcheck_trap(struct pt_regs *regs);
+#else
+static inline bool kmemcheck_active(struct pt_regs *regs)
+{
+ return false;
+}
+
+static inline void kmemcheck_show(struct pt_regs *regs)
+{
+}
+
+static inline void kmemcheck_hide(struct pt_regs *regs)
+{
+}
+
+static inline bool kmemcheck_fault(struct pt_regs *regs,
+ unsigned long address, unsigned long error_code)
+{
+ return false;
+}
+
+static inline bool kmemcheck_trap(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_KMEMCHECK */
+
+#endif
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..25e94a4412c4 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,6 +40,14 @@
#ifndef __ASSEMBLY__
+enum bootmem_state {
+ BEFORE_BOOTMEM,
+ DURING_BOOTMEM,
+ AFTER_BOOTMEM
+};
+
+extern enum bootmem_state bootmem_state;
+
extern int page_is_ram(unsigned long pagenr);
extern int devmem_is_allowed(unsigned long pagenr);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18ef7ebf2631..577485ba3306 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -478,6 +478,11 @@ static inline int pgd_none(pgd_t pgd)
}
#endif /* PAGETABLE_LEVELS > 3 */
+static inline int pte_hidden(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_HIDDEN;
+}
+
#endif /* __ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
-#define _PAGE_BIT_UNUSED3 11
+#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
-#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
#define __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_KMEMCHECK
+#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_HIDDEN (_AT(pteval_t, 0))
+#endif
+
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#else
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
* No 3D Now!
*/
+#ifndef CONFIG_KMEMCHECK
#define memcpy(t, f, n) \
(__builtin_constant_p((n)) \
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(t, f, n) __memcpy((t), (f), (n))
+#endif
#endif
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
function. */
#define __HAVE_ARCH_MEMCPY 1
+#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
extern void *memcpy(void *to, const void *from, size_t len);
#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
__ret; \
})
#endif
+#else
+/*
+ * kmemcheck becomes very happy if we use the REP instructions unconditionally,
+ * because it means that we know both memory operands in advance.
+ */
+#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
+#endif
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 602c769fc98c..b0783520988b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -154,9 +154,9 @@ struct thread_info {
/* thread information allocation */
#ifdef CONFIG_DEBUG_STACK_USAGE
-#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
#else
-#define THREAD_FLAGS GFP_KERNEL
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
#endif
#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
+#ifdef CONFIG_KMEMCHECK
+/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
+# include <asm-generic/xor.h>
+#else
#ifdef CONFIG_X86_32
# include "xor_32.h"
#else
# include "xor_64.h"
#endif
+#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index daed39ba2614..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * P4s have a "fast strings" feature which causes single-
+ * stepping REP instructions to only generate a #DB on
+ * cache-line boundaries.
+ *
+ * Ingo Molnar reported a Pentium D (model 6) and a Xeon
+ * (model 2) with the same problem.
+ */
+ if (c->x86 == 15) {
+ u64 misc_enable;
+
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
+
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ }
+ }
+#endif
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3bb2be1649bd..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,7 +63,7 @@ void arch_task_cache_init(void)
task_xstate_cachep =
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
- SLAB_PANIC, NULL);
+ SLAB_PANIC | SLAB_NOTRACK, NULL);
}
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 049ad2a42756..4a5b3dbcf3e3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -916,6 +916,7 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn);
+ bootmem_state = DURING_BOOTMEM;
#ifdef CONFIG_ACPI_SLEEP
/*
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394f..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace);
+void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+{
+ dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9e6e57d307ed..5fcc1437fea4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -46,6 +46,7 @@
#endif
#include <asm/stacktrace.h>
+#include <asm/kmemcheck.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/atomic.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
get_debugreg(condition, 6);
+ /* Catch kmemcheck conditions first of all! */
+ if (condition & DR_STEP && kmemcheck_trap(regs))
+ return;
+
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5ec7ae366615..507bc9503235 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
+#include <asm/kmemcheck.h>
/*
* Page fault error code bits:
@@ -955,6 +956,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
/* Get the faulting address: */
address = read_cr2();
+ /*
+ * Detect and handle instructions that would cause a page fault for
+ * both a tracked kernel page and a userspace page.
+ */
+ if (kmemcheck_active(regs))
+ kmemcheck_hide(regs);
+
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -972,9 +980,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
- vmalloc_fault(address) >= 0)
- return;
+ if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+
+ if (kmemcheck_fault(regs, address, error_code))
+ return;
+ }
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..104359086680 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -19,7 +19,7 @@ unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
unsigned long __meminitdata e820_table_top;
-int after_bootmem;
+enum bootmem_state bootmem_state = BEFORE_BOOTMEM;
int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
@@ -210,10 +210,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
- if (!after_bootmem)
+ if (bootmem_state == BEFORE_BOOTMEM)
init_gbpages();
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
@@ -346,7 +346,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem)
+ if (bootmem_state == BEFORE_BOOTMEM)
find_early_table_space(end, use_pse, use_gbpages);
#ifdef CONFIG_X86_32
@@ -367,7 +367,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
#endif
#ifdef CONFIG_X86_64
- if (!after_bootmem && !start) {
+ if (bootmem_state == BEFORE_BOOTMEM && !start) {
pud_t *pud;
pmd_t *pmd;
@@ -387,11 +387,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
#endif
__flush_tlb_all();
- if (!after_bootmem && e820_table_end > e820_table_start)
+ if (bootmem_state == BEFORE_BOOTMEM &&
+ e820_table_end > e820_table_start)
reserve_early(e820_table_start << PAGE_SHIFT,
e820_table_end << PAGE_SHIFT, "PGTABLE");
- if (!after_bootmem)
+ if (bootmem_state == BEFORE_BOOTMEM)
early_memtest(start, end);
return ret >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 949708d7a481..5da919b9b601 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -79,14 +79,20 @@ static __init void *alloc_low_page(void)
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
pud_t *pud;
- pmd_t *pmd_table;
+ pmd_t *pmd_table = NULL;
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
+ switch (bootmem_state) {
+ case DURING_BOOTMEM:
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- else
+ break;
+ case BEFORE_BOOTMEM:
pmd_table = (pmd_t *)alloc_low_page();
+ break;
+ default:
+ panic("after bootmem call one_md_table_init\n");
+ }
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -110,15 +116,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = NULL;
- if (after_bootmem) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ switch (bootmem_state) {
+ case DURING_BOOTMEM:
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
if (!page_table)
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- } else
+ break;
+ case BEFORE_BOOTMEM:
page_table = (pte_t *)alloc_low_page();
+ break;
+ default:
+ panic("after bootmem call one_page_table_init\n");
+ }
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -166,7 +178,7 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
pte_t *newpte;
int i;
- BUG_ON(after_bootmem);
+ BUG_ON(bootmem_state != BEFORE_BOOTMEM);
newpte = alloc_low_page();
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
@@ -800,8 +812,6 @@ void __init setup_bootmem_allocator(void)
bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
bootmap);
}
-
- after_bootmem = 1;
}
/*
@@ -868,6 +878,8 @@ void __init mem_init(void)
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
+ bootmem_state = AFTER_BOOTMEM;
+
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..859a73b0c5c3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,20 +97,26 @@ __setup("noexec32=", nonx32_setup);
/*
* NOTE: This function is marked __ref because it calls __init function
- * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ * (alloc_bootmem_pages). It's safe to do it ONLY when DURING_BOOTMEM.
*/
static __ref void *spp_getpage(void)
{
- void *ptr;
+ void *ptr = NULL;
- if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
- else
+ switch (bootmem_state) {
+ case AFTER_BOOTMEM:
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ break;
+ case DURING_BOOTMEM:
ptr = alloc_bootmem_pages(PAGE_SIZE);
+ break;
+ default:
+ panic("calling spp_getpage before bootmem\n");
+ }
if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
panic("set_pte_phys: cannot allocate page data %s\n",
- after_bootmem ? "after bootmem" : "");
+ bootmem_state == AFTER_BOOTMEM ? "after bootmem" : "");
}
pr_debug("spp_getpage %p\n", ptr);
@@ -277,16 +283,17 @@ void __init cleanup_highmap(void)
static __ref void *alloc_low_page(unsigned long *phys)
{
- unsigned long pfn = e820_table_end++;
+ unsigned long pfn;
void *adr;
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (bootmem_state == AFTER_BOOTMEM) {
+ adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
*phys = __pa(adr);
return adr;
}
+ pfn = e820_table_end++;
if (pfn >= e820_table_top)
panic("alloc_low_page: ran out of memory");
@@ -298,7 +305,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
static __ref void unmap_low_page(void *adr)
{
- if (after_bootmem)
+ if (bootmem_state == AFTER_BOOTMEM)
return;
early_iounmap(adr, PAGE_SIZE);
@@ -317,7 +324,7 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
if (addr >= end) {
- if (!after_bootmem) {
+ if (bootmem_state != AFTER_BOOTMEM) {
for(; i < PTRS_PER_PTE; i++, pte++)
set_pte(pte, __pte(0));
}
@@ -373,7 +380,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pgprot_t new_prot = prot;
if (address >= end) {
- if (!after_bootmem) {
+ if (bootmem_state != AFTER_BOOTMEM) {
for (; i < PTRS_PER_PMD; i++, pmd++)
set_pmd(pmd, __pmd(0));
}
@@ -459,7 +466,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
if (addr >= end)
break;
- if (!after_bootmem &&
+ if (bootmem_state != AFTER_BOOTMEM &&
!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
@@ -650,8 +657,6 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- reservedpages = 0;
-
/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
totalram_pages = numa_free_all_bootmem();
@@ -659,9 +664,9 @@ void __init mem_init(void)
totalram_pages = free_all_bootmem();
#endif
+ bootmem_state = AFTER_BOOTMEM;
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
- after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+ KMEMCHECK_ERROR_INVALID_ACCESS,
+ KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+ enum kmemcheck_error_type type;
+
+ union {
+ /* KMEMCHECK_ERROR_INVALID_ACCESS */
+ struct {
+ /* Kind of access that caused the error */
+ enum kmemcheck_shadow state;
+ /* Address and size of the erroneous read */
+ unsigned long address;
+ unsigned int size;
+ };
+ };
+
+ struct pt_regs regs;
+ struct stack_trace trace;
+ unsigned long trace_entries[32];
+
+ /* We compress it to a char. */
+ unsigned char shadow_copy[SHADOW_COPY_SIZE];
+ unsigned char memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == ARRAY_SIZE(error_fifo)) {
+ ++error_missed_count;
+ return NULL;
+ }
+
+ e = &error_fifo[error_wr];
+ if (++error_wr == ARRAY_SIZE(error_fifo))
+ error_wr = 0;
+ ++error_count;
+ return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == 0)
+ return NULL;
+
+ e = &error_fifo[error_rd];
+ if (++error_rd == ARRAY_SIZE(error_fifo))
+ error_rd = 0;
+ --error_count;
+ return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+ static const char *desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
+ [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
+ [KMEMCHECK_SHADOW_FREED] = "freed",
+ };
+
+ static const char short_desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
+ [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
+ [KMEMCHECK_SHADOW_FREED] = 'f',
+ };
+
+ struct kmemcheck_error *e;
+ unsigned int i;
+
+ e = error_next_rd();
+ if (!e)
+ return;
+
+ switch (e->type) {
+ case KMEMCHECK_ERROR_INVALID_ACCESS:
+ printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
+ "from %s memory (%p)\n",
+ 8 * e->size, e->state < ARRAY_SIZE(desc) ?
+ desc[e->state] : "(invalid shadow state)",
+ (void *) e->address);
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+ printk("%02x", e->memory_copy[i]);
+ printk("\n");
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+ if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+ printk(" %c", short_desc[e->shadow_copy[i]]);
+ else
+ printk(" ?");
+ }
+ printk("\n");
+ printk(KERN_INFO "%*c\n", 2 + 2
+ * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+ break;
+ case KMEMCHECK_ERROR_BUG:
+ printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+ break;
+ }
+
+ __show_regs(&e->regs, 1);
+ print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+ while (error_count > 0)
+ kmemcheck_error_recall();
+
+ if (error_missed_count > 0) {
+ printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+ "the queue was too small\n", error_missed_count);
+ error_missed_count = 0;
+ }
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+ static unsigned long prev_ip;
+
+ struct kmemcheck_error *e;
+ void *shadow_copy;
+ void *memory_copy;
+
+ /* Don't report several adjacent errors from the same EIP. */
+ if (regs->ip == prev_ip)
+ return;
+ prev_ip = regs->ip;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+ e->state = state;
+ e->address = address;
+ e->size = size;
+
+ /* Save regs */
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ /* Save stack trace */
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 0;
+ save_stack_trace_bp(&e->trace, regs->bp);
+
+ /* Round address down to nearest 16 bytes */
+ shadow_copy = kmemcheck_shadow_lookup(address
+ & ~(SHADOW_COPY_SIZE - 1));
+ BUG_ON(!shadow_copy);
+
+ memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+ kmemcheck_show_addr(address);
+ memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+ memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+ kmemcheck_hide_addr(address);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+ struct kmemcheck_error *e;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_BUG;
+
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 1;
+ save_stack_trace(&e->trace);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2b5ca659758f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,637 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+# define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+void __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Limit SMP to use a single CPU. We rely on the fact that this code
+ * runs before SMP is set up.
+ */
+ if (setup_max_cpus > 1) {
+ printk(KERN_INFO
+ "kmemcheck: Limiting number of CPUs to 1.\n");
+ setup_max_cpus = 1;
+ }
+#endif
+
+ if (!kmemcheck_selftest()) {
+ printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+ kmemcheck_enabled = 0;
+ return;
+ }
+
+ printk(KERN_INFO "kmemcheck: Initialized\n");
+}
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ sscanf(str, "%d", &kmemcheck_enabled);
+ return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+struct kmemcheck_context {
+ bool busy;
+ int balance;
+
+ /*
+ * There can be at most two memory operands to an instruction, but
+ * each address can cross a page boundary -- so we may need up to
+ * four addresses that must be hidden/revealed for each fault.
+ */
+ unsigned long addr[4];
+ unsigned long n_addrs;
+ unsigned long flags;
+
+ /* Data size of the instruction that caused a fault. */
+ unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+ data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_show_addr(data->addr[i]);
+
+ return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_hide_addr(data->addr[i]);
+
+ return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely(data->balance != 0)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->balance = 0;
+ return;
+ }
+
+ /*
+ * None of the addresses actually belonged to kmemcheck. Note that
+ * this is not an error.
+ */
+ if (kmemcheck_show_all() == 0)
+ return;
+
+ ++data->balance;
+
+ /*
+ * The IF needs to be cleared as well, so that the faulting
+ * instruction can run "uninterrupted". Otherwise, we might take
+ * an interrupt and start executing that before we've had a chance
+ * to hide the page again.
+ *
+ * NOTE: In the rare case of multiple faults, we must not override
+ * the original flags:
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ data->flags = regs->flags;
+
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ int n;
+
+ BUG_ON(!irqs_disabled());
+
+ if (data->balance == 0)
+ return;
+
+ if (unlikely(data->balance != 1)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->n_addrs = 0;
+ data->balance = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+ return;
+ }
+
+ if (kmemcheck_enabled)
+ n = kmemcheck_hide_all();
+ else
+ n = kmemcheck_show_all();
+
+ if (n == 0)
+ return;
+
+ --data->balance;
+
+ data->n_addrs = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+ /* This will also check the "hidden" flag of the PTE. */
+ return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+ enum kmemcheck_shadow status;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+
+ /* Don't warn about it again. */
+ kmemcheck_shadow_set(shadow, size);
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_read_strict(regs, addr, size);
+ return;
+ }
+
+ /*
+ * What we do is basically to split the access across the
+ * two pages and handle each part separately. Yes, this means
+ * that we may now see reads that are 3 + 5 bytes, for
+ * example (and if both are uninitialized, there will be two
+ * reports), but it makes the code a lot simpler.
+ */
+ kmemcheck_read_strict(regs, addr, next_page - addr);
+ kmemcheck_read_strict(regs, next_page, next_addr - next_page);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_write_strict(regs, addr, size);
+ return;
+ }
+
+ /* See comment in kmemcheck_read(). */
+ kmemcheck_write_strict(regs, addr, next_page - addr);
+ kmemcheck_write_strict(regs, next_page, next_addr - next_page);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+ unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+ uint8_t shadow[8];
+ enum kmemcheck_shadow status;
+
+ unsigned long page;
+ unsigned long next_addr;
+ unsigned long next_page;
+
+ uint8_t *x;
+ unsigned int i;
+ unsigned int n;
+
+ BUG_ON(size > sizeof(shadow));
+
+ page = src_addr & PAGE_MASK;
+ next_addr = src_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < size; ++i)
+ shadow[i] = x[i];
+ } else {
+ for (i = 0; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ } else {
+ n = next_page - src_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < n; ++i)
+ shadow[i] = x[i];
+ } else {
+ /* Not tracked */
+ for (i = 0; i < n; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i)
+ shadow[i] = x[i - n];
+ } else {
+ /* Not tracked */
+ for (i = n; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ page = dst_addr & PAGE_MASK;
+ next_addr = dst_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < size; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ } else {
+ n = next_page - dst_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < n; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i) {
+ x[i - n] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ }
+
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, src_addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+ KMEMCHECK_READ,
+ KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+ unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+ const uint8_t *insn;
+ const uint8_t *insn_primary;
+ unsigned int size;
+
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ /* Recursive fault -- ouch. */
+ if (data->busy) {
+ kmemcheck_show_addr(fallback_address);
+ kmemcheck_error_save_bug(regs);
+ return;
+ }
+
+ data->busy = true;
+
+ insn = (const uint8_t *) regs->ip;
+ insn_primary = kmemcheck_opcode_get_primary(insn);
+
+ kmemcheck_opcode_decode(insn, &size);
+
+ switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+ /* AND, OR, XOR */
+ /*
+ * Unfortunately, these instructions have to be excluded from
+ * our regular checking since they access only some (and not
+ * all) bits. This clears out "bogus" bitfield-access warnings.
+ */
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ switch ((insn_primary[1] >> 3) & 7) {
+ /* OR */
+ case 1:
+ /* AND */
+ case 4:
+ /* XOR */
+ case 6:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+
+ /* ADD */
+ case 0:
+ /* ADC */
+ case 2:
+ /* SBB */
+ case 3:
+ /* SUB */
+ case 5:
+ /* CMP */
+ case 7:
+ break;
+ }
+ break;
+#endif
+
+ /* MOVS, MOVSB, MOVSW, MOVSD */
+ case 0xa4:
+ case 0xa5:
+ /*
+ * These instructions are special because they take two
+ * addresses, but we only get one page fault.
+ */
+ kmemcheck_copy(regs, regs->si, regs->di, size);
+ goto out;
+
+ /* CMPS, CMPSB, CMPSW, CMPSD */
+ case 0xa6:
+ case 0xa7:
+ kmemcheck_read(regs, regs->si, size);
+ kmemcheck_read(regs, regs->di, size);
+ goto out;
+ }
+
+ /*
+ * If the opcode isn't special in any way, we use the data from the
+ * page fault handler to determine the address and type of memory
+ * access.
+ */
+ switch (fallback_method) {
+ case KMEMCHECK_READ:
+ kmemcheck_read(regs, fallback_address, size);
+ goto out;
+ case KMEMCHECK_WRITE:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+ }
+
+out:
+ data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ pte_t *pte;
+
+ /*
+ * XXX: Is it safe to assume that memory accesses from virtual 86
+ * mode or non-kernel code segments will _never_ access kernel
+ * memory (e.g. tracked pages)? For now, we need this to avoid
+ * invoking kmemcheck for PnP BIOS calls.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return false;
+ if (regs->cs != __KERNEL_CS)
+ return false;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return false;
+
+ if (error_code & 2)
+ kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+ else
+ kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+ kmemcheck_show(regs);
+ return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+ if (!kmemcheck_active(regs))
+ return false;
+
+ /* We're done. */
+ kmemcheck_hide(regs);
+ return true;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+ return
+ /* Group 1 */
+ b == 0xf0 || b == 0xf2 || b == 0xf3
+ /* Group 2 */
+ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+ || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+ /* Group 3 */
+ || b == 0x66
+ /* Group 4 */
+ || b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+ /* Default operand size */
+ int operand_size_override = 4;
+
+ /* prefixes */
+ for (; opcode_is_prefix(*op); ++op) {
+ if (*op == 0x66)
+ operand_size_override = 2;
+ }
+
+ /* REX prefix */
+ if (opcode_is_rex_prefix(*op)) {
+ uint8_t rex = *op;
+
+ ++op;
+ if (rex & REX_W) {
+ switch (*op) {
+ case 0x63:
+ *size = 4;
+ return;
+ case 0x0f:
+ ++op;
+
+ switch (*op) {
+ case 0xb6:
+ case 0xbe:
+ *size = 1;
+ return;
+ case 0xb7:
+ case 0xbf:
+ *size = 2;
+ return;
+ }
+
+ break;
+ }
+
+ *size = 8;
+ return;
+ }
+ }
+
+ /* escape opcode */
+ if (*op == 0x0f) {
+ ++op;
+
+ /*
+ * This is move with zero-extend and sign-extend, respectively;
+ * we don't have to think about 0xb6/0xbe, because this is
+ * already handled in the conditional below.
+ */
+ if (*op == 0xb7 || *op == 0xbf)
+ operand_size_override = 2;
+ }
+
+ *size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+ /* skip prefixes */
+ while (opcode_is_prefix(*op))
+ ++op;
+ if (opcode_is_rex_prefix(*op))
+ ++op;
+ return op;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (!pte)
+ return NULL;
+ if (level != PG_LEVEL_4K)
+ return NULL;
+ if (!pte_hidden(*pte))
+ return NULL;
+
+ return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+ unsigned int expected_size;
+ const uint8_t *insn;
+ const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+ /* REP MOVS */
+ {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
+ {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
+
+ /* MOVZX / MOVZXD */
+ {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
+ {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
+ {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+ /* MOVZX / MOVZXD */
+ {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
+ {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+ unsigned size;
+
+ kmemcheck_opcode_decode(op->insn, &size);
+
+ if (size == op->expected_size)
+ return true;
+
+ printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+ op->desc, op->expected_size, size);
+ return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+ bool pass = true;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+ pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+ return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+ bool pass = true;
+
+ pass = pass && selftest_opcodes_all();
+
+ return pass;
+}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+ pte_t *pte;
+ struct page *page;
+
+ if (!virt_addr_valid(address))
+ return NULL;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return NULL;
+
+ page = virt_to_page(address);
+ if (!page->shadow)
+ return NULL;
+ return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+ enum kmemcheck_shadow status)
+{
+ unsigned long addr = (unsigned long) address;
+ unsigned long last_addr = addr + n - 1;
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long last_page = last_addr & PAGE_MASK;
+ unsigned int first_n;
+ void *shadow;
+
+ /* If the memory range crosses a page boundary, stop there. */
+ if (page == last_page)
+ first_n = n;
+ else
+ first_n = page + PAGE_SIZE - addr;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, first_n);
+
+ addr += first_n;
+ n -= first_n;
+
+ /* Do full-page memset()s. */
+ while (n >= PAGE_SIZE) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, PAGE_SIZE);
+
+ addr += PAGE_SIZE;
+ n -= PAGE_SIZE;
+ }
+
+ /* Do the remaining page, if any. */
+ if (n > 0) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, n);
+ }
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+ /*
+ * Make sure _some_ bytes are initialized. Gcc frequently generates
+ * code to access neighboring bytes.
+ */
+ for (i = 0; i < size; ++i) {
+ if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#else
+ /* All bytes must be initialized. */
+ for (i = 0; i < size; ++i) {
+ if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#endif
+
+ return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+ for (i = 0; i < size; ++i)
+ x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+ KMEMCHECK_SHADOW_UNALLOCATED,
+ KMEMCHECK_SHADOW_UNINITIALIZED,
+ KMEMCHECK_SHADOW_INITIALIZED,
+ KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..3cfe9ced8a4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
if (!debug_pagealloc)
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL, 0);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
if (!debug_pagealloc)
spin_lock(&cpa_lock);
if (!base)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..8e43bdd45456 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP, 0);
#endif
if (pte)
pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *pmds[PREALLOCATED_PMDS];
unsigned long flags;
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
diff --git a/crypto/xor.c b/crypto/xor.c
index 996b6ee57d9e..fc5b836f3430 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -101,7 +101,12 @@ calibrate_xor_blocks(void)
void *b1, *b2;
struct xor_block_template *f, *fastest;
- b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
+ /*
+ * Note: Since the memory is not actually used for _anything_ but to
+ * test the XOR speed, we don't really want kmemcheck to warn about
+ * reading uninitialized bytes here.
+ */
+ b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
if (!b1) {
printk(KERN_WARNING "xor: Yikes! No memory available.\n");
return -ENOMEM;
diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index a6dfeb0b3372..e76cac64c533 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -35,6 +35,7 @@
#include <linux/errno.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/string.h>
#include <asm/bug.h>
#include <asm/byteorder.h>
@@ -387,6 +388,7 @@ csr1212_new_descriptor_leaf(u8 dtype, u32 specifier_id,
if (!kv)
return NULL;
+ kmemcheck_annotate_variable(kv->value.leaf.data[0]);
CSR1212_DESCRIPTOR_LEAF_SET_TYPE(kv, dtype);
CSR1212_DESCRIPTOR_LEAF_SET_SPECIFIER_ID(kv, specifier_id);
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index a6d55bebe61a..5122b5a8aa2d 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -10,6 +10,7 @@
#include <linux/bitmap.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/delay.h>
@@ -39,7 +40,10 @@ struct nodemgr_csr_info {
struct hpsb_host *host;
nodeid_t nodeid;
unsigned int generation;
+
+ kmemcheck_bitfield_begin(flags);
unsigned int speed_unverified:1;
+ kmemcheck_bitfield_end(flags);
};
@@ -1293,6 +1297,7 @@ static void nodemgr_node_scan_one(struct hpsb_host *host,
u8 *speed;
ci = kmalloc(sizeof(*ci), GFP_KERNEL);
+ kmemcheck_annotate_bitfield(ci, flags);
if (!ci)
return;
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index 0207dd59090d..b5346b4db91a 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/idr.h>
@@ -891,6 +892,7 @@ struct c2port_device *c2port_device_register(char *name,
return ERR_PTR(-EINVAL);
c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL);
+ kmemcheck_annotate_bitfield(c2dev, flags);
if (unlikely(!c2dev))
return ERR_PTR(-ENOMEM);
diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 7b5a2388ba67..2a5cd867c365 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -10,6 +10,7 @@
*/
#include <linux/device.h>
+#include <linux/kmemcheck.h>
#define C2PORT_NAME_LEN 32
@@ -20,8 +21,10 @@
/* Main struct */
struct c2port_ops;
struct c2port_device {
+ kmemcheck_bitfield_begin(flags);
unsigned int access:1;
unsigned int flash_access:1;
+ kmemcheck_bitfield_end(flags);
int id;
char name[C2PORT_NAME_LEN];
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e045f2bed59..5804a2204b85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1920,8 +1920,9 @@ extern void __init vfs_caches_init(unsigned long);
extern struct kmem_cache *names_cachep;
-#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
-#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
+#define __getname_gfp(gfp) kmem_cache_alloc(names_cachep, (gfp))
+#define __getname() __getname_gfp(GFP_KERNEL)
+#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
#ifndef CONFIG_AUDITSYSCALL
#define putname(name) __putname(name)
#else
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0bbc15f54536..3885e7f75562 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -52,7 +52,19 @@ struct vm_area_struct;
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */
-#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
+#ifdef CONFIG_KMEMCHECK
+#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */
+#else
+#define __GFP_NOTRACK ((__force gfp_t)0)
+#endif
+
+/*
+ * This may seem redundant, but it's a way of annotating false positives vs.
+ * allocations that simply cannot be supported (e.g. page tables).
+ */
+#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
+
+#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index ff374ceface0..dd574d51bcaa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -466,6 +466,20 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
__tasklet_hi_schedule(t);
}
+extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
+
+/*
+ * This version avoids touching any other tasklets. Needed for kmemcheck
+ * in order not to take any page faults while enqueueing this tasklet;
+ * consider VERY carefully whether you really need this or
+ * tasklet_hi_schedule()...
+ */
+static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+ __tasklet_hi_schedule_first(t);
+}
+
static inline void tasklet_disable_nosync(struct tasklet_struct *t)
{
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
new file mode 100644
index 000000000000..0efbaacad7e5
--- /dev/null
+++ b/include/linux/kmemcheck.h
@@ -0,0 +1,157 @@
+#ifndef LINUX_KMEMCHECK_H
+#define LINUX_KMEMCHECK_H
+
+#include <linux/mm_types.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KMEMCHECK
+extern int kmemcheck_enabled;
+
+void kmemcheck_init(void);
+
+/* The slab-related functions. */
+void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
+void kmemcheck_free_shadow(struct page *page, int order);
+void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+ size_t size);
+void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
+
+void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
+ gfp_t gfpflags);
+
+void kmemcheck_show_pages(struct page *p, unsigned int n);
+void kmemcheck_hide_pages(struct page *p, unsigned int n);
+
+bool kmemcheck_page_is_tracked(struct page *p);
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n);
+void kmemcheck_mark_uninitialized(void *address, unsigned int n);
+void kmemcheck_mark_initialized(void *address, unsigned int n);
+void kmemcheck_mark_freed(void *address, unsigned int n);
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
+
+int kmemcheck_show_addr(unsigned long address);
+int kmemcheck_hide_addr(unsigned long address);
+
+#else
+#define kmemcheck_enabled 0
+
+static inline void kmemcheck_init(void)
+{
+}
+
+static inline void
+kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
+{
+}
+
+static inline void
+kmemcheck_free_shadow(struct page *page, int order)
+{
+}
+
+static inline void
+kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+ size_t size)
+{
+}
+
+static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
+ size_t size)
+{
+}
+
+static inline void kmemcheck_pagealloc_alloc(struct page *p,
+ unsigned int order, gfp_t gfpflags)
+{
+}
+
+static inline bool kmemcheck_page_is_tracked(struct page *p)
+{
+ return false;
+}
+
+static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_unallocated_pages(struct page *p,
+ unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
+ unsigned int n)
+{
+}
+
+static inline void kmemcheck_mark_initialized_pages(struct page *p,
+ unsigned int n)
+{
+}
+
+#endif /* CONFIG_KMEMCHECK */
+
+/*
+ * How to use: If you have a struct using bitfields, for example
+ *
+ * struct a {
+ * int x:8, y:8;
+ * };
+ *
+ * then this should be rewritten as
+ *
+ * struct a {
+ * kmemcheck_bitfield_begin(flags);
+ * int x:8, y:8;
+ * kmemcheck_bitfield_end(flags);
+ * };
+ *
+ * Now the "flags_begin" and "flags_end" members may be used to refer to the
+ * beginning and end, respectively, of the bitfield (and things like
+ * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
+ * fields should be annotated:
+ *
+ * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
+ * kmemcheck_annotate_bitfield(a, flags);
+ *
+ * Note: We provide the same definitions for both kmemcheck and non-
+ * kmemcheck kernels. This makes it harder to introduce accidental errors. It
+ * is also allowed to pass NULL pointers to kmemcheck_annotate_bitfield().
+ */
+#define kmemcheck_bitfield_begin(name) \
+ int name##_begin[0];
+
+#define kmemcheck_bitfield_end(name) \
+ int name##_end[0];
+
+#define kmemcheck_annotate_bitfield(ptr, name) \
+ do if (ptr) { \
+ int _n = (long) &((ptr)->name##_end) \
+ - (long) &((ptr)->name##_begin); \
+ BUILD_BUG_ON(_n < 0); \
+ \
+ kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \
+ } while (0)
+
+#define kmemcheck_annotate_variable(var) \
+ do { \
+ kmemcheck_mark_initialized(&(var), sizeof(var)); \
+ } while (0) \
+
+#endif /* LINUX_KMEMCHECK_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e80e26ecf21..0042090a4d70 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -98,6 +98,14 @@ struct page {
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * kmemcheck wants to track the status of each byte in a page; this
+ * is a pointer to such a status block. NULL if not tracked.
+ */
+ void *shadow;
+#endif
};
/*
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f1345828c7c5..e1ec43124c2f 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -1,6 +1,7 @@
#ifndef _LINUX_RING_BUFFER_H
#define _LINUX_RING_BUFFER_H
+#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
@@ -11,7 +12,10 @@ struct ring_buffer_iter;
* Don't refer to this struct directly, use functions below.
*/
struct ring_buffer_event {
+ kmemcheck_bitfield_begin(bitfield);
u32 type_len:5, time_delta:27;
+ kmemcheck_bitfield_end(bitfield);
+
u32 array[];
};
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fa51293f2708..63ef24bc01d0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,6 +15,7 @@
#define _LINUX_SKBUFF_H
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/cache.h>
@@ -343,6 +344,7 @@ struct sk_buff {
};
};
__u32 priority;
+ kmemcheck_bitfield_begin(flags1);
__u8 local_df:1,
cloned:1,
ip_summed:2,
@@ -353,6 +355,7 @@ struct sk_buff {
ipvs_property:1,
peeked:1,
nf_trace:1;
+ kmemcheck_bitfield_end(flags1);
__be16 protocol;
void (*destructor)(struct sk_buff *skb);
@@ -372,12 +375,16 @@ struct sk_buff {
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
+
+ kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
__u8 do_not_encrypt:1;
#endif
+ kmemcheck_bitfield_end(flags2);
+
/* 0/13/14 bit hole */
#ifdef CONFIG_NET_DMA
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 76f21a61db32..6cbe637eac68 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -64,6 +64,13 @@
#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */
+/* Don't track use of uninitialized memory */
+#ifdef CONFIG_KMEMCHECK
+# define SLAB_NOTRACK 0x01000000UL
+#else
+# define SLAB_NOTRACK 0x00000000UL
+#endif
+
/* The following flags affect the page allocator grouping pages by mobility */
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 713f841ecaa9..850d057500de 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -16,6 +16,87 @@
#include <linux/compiler.h>
#include <linux/kmemtrace.h>
+/*
+ * struct kmem_cache
+ *
+ * manages a cache.
+ */
+
+struct kmem_cache {
+/* 1) per-cpu data, touched during every alloc/free */
+ struct array_cache *array[NR_CPUS];
+/* 2) Cache tunables. Protected by cache_chain_mutex */
+ unsigned int batchcount;
+ unsigned int limit;
+ unsigned int shared;
+
+ unsigned int buffer_size;
+ u32 reciprocal_buffer_size;
+/* 3) touched by every alloc & free from the backend */
+
+ unsigned int flags; /* constant flags */
+ unsigned int num; /* # of objs per slab */
+
+/* 4) cache_grow/shrink */
+ /* order of pgs per slab (2^n) */
+ unsigned int gfporder;
+
+ /* force GFP flags, e.g. GFP_DMA */
+ gfp_t gfpflags;
+
+ size_t colour; /* cache colouring range */
+ unsigned int colour_off; /* colour offset */
+ struct kmem_cache *slabp_cache;
+ unsigned int slab_size;
+ unsigned int dflags; /* dynamic flags */
+
+ /* constructor func */
+ void (*ctor)(void *obj);
+
+/* 5) cache creation/removal */
+ const char *name;
+ struct list_head next;
+
+/* 6) statistics */
+#ifdef CONFIG_DEBUG_SLAB
+ unsigned long num_active;
+ unsigned long num_allocations;
+ unsigned long high_mark;
+ unsigned long grown;
+ unsigned long reaped;
+ unsigned long errors;
+ unsigned long max_freeable;
+ unsigned long node_allocs;
+ unsigned long node_frees;
+ unsigned long node_overflow;
+ atomic_t allochit;
+ atomic_t allocmiss;
+ atomic_t freehit;
+ atomic_t freemiss;
+
+ /*
+ * If debugging is enabled, then the allocator can add additional
+ * fields and/or padding to every object. buffer_size contains the total
+ * object size including these internal fields, the following two
+ * variables contain the offset to the user object and its size.
+ */
+ int obj_offset;
+ int obj_size;
+#endif /* CONFIG_DEBUG_SLAB */
+
+ /*
+ * We put nodelists[] at the end of kmem_cache, because we want to size
+ * this array to nr_node_ids slots instead of MAX_NUMNODES
+ * (see kmem_cache_init())
+ * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
+ * is statically defined, so we reserve the max number of nodes.
+ */
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
+ /*
+ * Do not add fields after nodelists[]
+ */
+};
+
/* Size description struct for general caches. */
struct cache_sizes {
size_t cs_size;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 1a8cecc4f38c..51efbef38fb0 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -4,6 +4,8 @@
struct task_struct;
#ifdef CONFIG_STACKTRACE
+struct task_struct;
+
struct stack_trace {
unsigned int nr_entries, max_entries;
unsigned long *entries;
@@ -11,6 +13,7 @@ struct stack_trace {
};
extern void save_stack_trace(struct stack_trace *trace);
+extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
extern void save_stack_trace_tsk(struct task_struct *tsk,
struct stack_trace *trace);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 20a6957af870..47004f35cc7e 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,6 +17,7 @@
#define _INET_SOCK_H
+#include <linux/kmemcheck.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jhash.h>
@@ -66,14 +67,16 @@ struct inet_request_sock {
__be32 loc_addr;
__be32 rmt_addr;
__be16 rmt_port;
- u16 snd_wscale : 4,
- rcv_wscale : 4,
+ kmemcheck_bitfield_begin(flags);
+ u16 snd_wscale : 4,
+ rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
ecn_ok : 1,
acked : 1,
no_srccheck: 1;
+ kmemcheck_bitfield_end(flags);
struct ip_options *opt;
};
@@ -199,9 +202,12 @@ static inline int inet_sk_ehashfn(const struct sock *sk)
static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops)
{
struct request_sock *req = reqsk_alloc(ops);
+ struct inet_request_sock *ireq = inet_rsk(req);
- if (req != NULL)
- inet_rsk(req)->opt = NULL;
+ if (req != NULL) {
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ }
return req;
}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4b8ece22b8e9..b63b80fac567 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -16,6 +16,7 @@
#define _INET_TIMEWAIT_SOCK_
+#include <linux/kmemcheck.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/timer.h>
@@ -127,10 +128,12 @@ struct inet_timewait_sock {
__be32 tw_rcv_saddr;
__be16 tw_dport;
__u16 tw_num;
+ kmemcheck_bitfield_begin(flags);
/* And these are ours. */
__u8 tw_ipv6only:1,
tw_transparent:1;
- /* 15 bits hole, try to pack */
+ /* 14 bits hole, try to pack */
+ kmemcheck_bitfield_end(flags);
__u16 tw_ipv6_offset;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9fd15b..d933da00d505 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -218,9 +218,11 @@ struct sock {
#define sk_hash __sk_common.skc_hash
#define sk_prot __sk_common.skc_prot
#define sk_net __sk_common.skc_net
+ kmemcheck_bitfield_begin(flags);
unsigned char sk_shutdown : 2,
sk_no_check : 2,
sk_userlocks : 4;
+ kmemcheck_bitfield_end(flags);
unsigned char sk_protocol;
unsigned short sk_type;
int sk_rcvbuf;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dd7ee5f203f3..093f65915501 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -231,7 +231,8 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
void __init mount_block_root(char *name, int flags)
{
- char *fs_names = __getname();
+ char *fs_names = __getname_gfp(GFP_KERNEL
+ | __GFP_NOTRACK_FALSE_POSITIVE);
char *p;
#ifdef CONFIG_BLOCK
char b[BDEVNAME_SIZE];
diff --git a/init/main.c b/init/main.c
index 748f79f795d8..72a9c6c04ac1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/idr.h>
+#include <linux/kmemcheck.h>
#include <linux/ftrace.h>
#include <linux/async.h>
#include <linux/kmemtrace.h>
@@ -787,6 +788,9 @@ static void __init do_pre_smp_initcalls(void)
{
initcall_t *call;
+ /* kmemcheck must initialize before all early initcalls: */
+ kmemcheck_init();
+
for (call = __initcall_start; call < __early_initcall_end; call++)
do_one_initcall(*call);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index bb762b4dd217..7f8571025411 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -177,7 +177,7 @@ void __init fork_init(unsigned long mempages)
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
/* do the arch specific task caches init */
@@ -1458,20 +1458,20 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
- sighand_ctor);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_NOTRACK, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 809a228019ad..d81f4952eebb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -832,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
{
struct sigpending *pending;
struct sigqueue *q;
+ int override_rlimit;
trace_sched_signal_send(sig, t);
@@ -863,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
make sure at least one signal gets delivered and don't
pass on the info struct. */
- q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
- (is_si_special(info) ||
- info->si_code >= 0)));
+ if (sig < SIGRTMIN)
+ override_rlimit = (is_si_special(info) || info->si_code >= 0);
+ else
+ override_rlimit = 0;
+
+ q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+ override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 258885a543db..b41fb710e114 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -382,6 +382,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
EXPORT_SYMBOL(__tasklet_hi_schedule);
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ BUG_ON(!irqs_disabled());
+
+ t->next = __get_cpu_var(tasklet_hi_vec).head;
+ __get_cpu_var(tasklet_hi_vec).head = t;
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
+
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 439c042428c0..892c4556e97b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
+#include <linux/kmemcheck.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -1298,6 +1299,16 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+#ifdef CONFIG_KMEMCHECK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_UNEVICTABLE_LRU
{
.ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7102d7a2fadb..dd20b4de49fb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
@@ -1265,6 +1266,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
if (tail < BUF_PAGE_SIZE) {
/* Mark the rest of the page with padding */
event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
rb_event_set_padding(event);
}
@@ -1322,6 +1324,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(event, type, length);
/* The passed in type is zero for DATA */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 444002b18f5d..6327c86d1682 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -300,7 +300,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
config DEBUG_SLAB
bool "Debug slab memory allocations"
- depends on DEBUG_KERNEL && SLAB
+ depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
help
Say Y here to have the kernel do limited verification on memory
allocation as well as poisoning memory on free to catch use of freed
@@ -312,7 +312,7 @@ config DEBUG_SLAB_LEAK
config SLUB_DEBUG_ON
bool "SLUB debugging on by default"
- depends on SLUB && SLUB_DEBUG
+ depends on SLUB && SLUB_DEBUG && !KMEMCHECK
default n
help
Boot with debugging on by default. SLUB boots by default with
@@ -1016,3 +1016,5 @@ config DMA_API_DEBUG
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
+
+source "lib/Kconfig.kmemcheck"
diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck
new file mode 100644
index 000000000000..603c81b66549
--- /dev/null
+++ b/lib/Kconfig.kmemcheck
@@ -0,0 +1,91 @@
+config HAVE_ARCH_KMEMCHECK
+ bool
+
+menuconfig KMEMCHECK
+ bool "kmemcheck: trap use of uninitialized memory"
+ depends on DEBUG_KERNEL
+ depends on !X86_USE_3DNOW
+ depends on SLUB || SLAB
+ depends on !CC_OPTIMIZE_FOR_SIZE
+ depends on !FUNCTION_TRACER
+ select FRAME_POINTER
+ select STACKTRACE
+ default n
+ help
+ This option enables tracing of dynamically allocated kernel memory
+ to see if memory is used before it has been given an initial value.
+ Be aware that this requires half of your memory for bookkeeping and
+ will insert extra code at *every* read and write to tracked memory
+ thus slow down the kernel code (but user code is unaffected).
+
+ The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
+ or enable kmemcheck at boot-time. If the kernel is started with
+ kmemcheck=0, the large memory and CPU overhead is not incurred.
+
+choice
+ prompt "kmemcheck: default mode at boot"
+ depends on KMEMCHECK
+ default KMEMCHECK_ONESHOT_BY_DEFAULT
+ help
+ This option controls the default behaviour of kmemcheck when the
+ kernel boots and no kmemcheck= parameter is given.
+
+config KMEMCHECK_DISABLED_BY_DEFAULT
+ bool "disabled"
+ depends on KMEMCHECK
+
+config KMEMCHECK_ENABLED_BY_DEFAULT
+ bool "enabled"
+ depends on KMEMCHECK
+
+config KMEMCHECK_ONESHOT_BY_DEFAULT
+ bool "one-shot"
+ depends on KMEMCHECK
+ help
+ In one-shot mode, only the first error detected is reported before
+ kmemcheck is disabled.
+
+endchoice
+
+config KMEMCHECK_QUEUE_SIZE
+ int "kmemcheck: error queue size"
+ depends on KMEMCHECK
+ default 64
+ help
+ Select the maximum number of errors to store in the queue. Since
+ errors can occur virtually anywhere and in any context, we need a
+ temporary storage area which is guarantueed not to generate any
+ other faults. The queue will be emptied as soon as a tasklet may
+ be scheduled. If the queue is full, new error reports will be
+ lost.
+
+config KMEMCHECK_SHADOW_COPY_SHIFT
+ int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
+ depends on KMEMCHECK
+ range 2 8
+ default 5
+ help
+ Select the number of shadow bytes to save along with each entry of
+ the queue. These bytes indicate what parts of an allocation are
+ initialized, uninitialized, etc. and will be displayed when an
+ error is detected to help the debugging of a particular problem.
+
+config KMEMCHECK_PARTIAL_OK
+ bool "kmemcheck: allow partially uninitialized memory"
+ depends on KMEMCHECK
+ default y
+ help
+ This option works around certain GCC optimizations that produce
+ 32-bit reads from 16-bit variables where the upper 16 bits are
+ thrown away afterwards. This may of course also hide some real
+ bugs.
+
+config KMEMCHECK_BITOPS_OK
+ bool "kmemcheck: allow bit-field manipulation"
+ depends on KMEMCHECK
+ default n
+ help
+ This option silences warnings that would be generated for bit-field
+ accesses where not all the bits are initialized at the same time.
+ This may also hide some real bugs.
+
diff --git a/mm/Makefile b/mm/Makefile
index fe8ebc2fdb7a..3f83260ec1eb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_SLQB) += slqb.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
+#include <linux/gfp.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmemcheck.h>
+
+void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ pages = 1 << order;
+
+ /*
+ * With kmemcheck enabled, we need to allocate a memory area for the
+ * shadow bits as well.
+ */
+ shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+ if (!shadow) {
+ if (printk_ratelimit())
+ printk(KERN_ERR "kmemcheck: failed to allocate "
+ "shadow bitmap\n");
+ return;
+ }
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = page_address(&shadow[i]);
+
+ /*
+ * Mark it as non-present for the MMU so that our accesses to
+ * this memory will trigger a page fault and let us analyze
+ * the memory accesses.
+ */
+ kmemcheck_hide_pages(page, pages);
+}
+
+void kmemcheck_free_shadow(struct page *page, int order)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ if (!kmemcheck_page_is_tracked(page))
+ return;
+
+ pages = 1 << order;
+
+ kmemcheck_show_pages(page, pages);
+
+ shadow = virt_to_page(page[0].shadow);
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = NULL;
+
+ __free_pages(shadow, order);
+}
+
+void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+ size_t size)
+{
+ /*
+ * Has already been memset(), which initializes the shadow for us
+ * as well.
+ */
+ if (gfpflags & __GFP_ZERO)
+ return;
+
+ /* No need to initialize the shadow of a non-tracked slab. */
+ if (s->flags & SLAB_NOTRACK)
+ return;
+
+ if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
+ /*
+ * Allow notracked objects to be allocated from
+ * tracked caches. Note however that these objects
+ * will still get page faults on access, they just
+ * won't ever be flagged as uninitialized. If page
+ * faults are not acceptable, the slab cache itself
+ * should be marked NOTRACK.
+ */
+ kmemcheck_mark_initialized(object, size);
+ } else if (!s->ctor) {
+ /*
+ * New objects should be marked uninitialized before
+ * they're returned to the called.
+ */
+ kmemcheck_mark_uninitialized(object, size);
+ }
+}
+
+void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
+{
+ /* TODO: RCU freeing is unsupported for now; hide false positives. */
+ if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
+ kmemcheck_mark_freed(object, size);
+}
+
+void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
+ gfp_t gfpflags)
+{
+ int pages;
+
+ if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
+ return;
+
+ pages = 1 << order;
+
+ /*
+ * NOTE: We choose to track GFP_ZERO pages too; in fact, they
+ * can become uninitialized by copying uninitialized memory
+ * into them.
+ */
+
+ /* XXX: Can use zone->node for node? */
+ kmemcheck_alloc_shadow(page, order, gfpflags, -1);
+
+ if (gfpflags & __GFP_ZERO)
+ kmemcheck_mark_initialized_pages(page, pages);
+ else
+ kmemcheck_mark_uninitialized_pages(page, pages);
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..0727896a88ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
@@ -546,6 +547,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
int i;
int bad = 0;
+ kmemcheck_free_shadow(page, order);
+
for (i = 0 ; i < (1 << order) ; ++i)
bad += free_pages_check(page + i);
if (bad)
@@ -994,6 +997,8 @@ static void free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
+ kmemcheck_free_shadow(page, 0);
+
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
@@ -1047,6 +1052,16 @@ void split_page(struct page *page, unsigned int order)
VM_BUG_ON(PageCompound(page));
VM_BUG_ON(!page_count(page));
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * Split shadow pages too, because free(page[0]) would
+ * otherwise free the whole shadow.
+ */
+ if (kmemcheck_page_is_tracked(page))
+ split_page(virt_to_page(page[0].shadow), order);
+#endif
+
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
}
@@ -1667,7 +1682,10 @@ nopage:
dump_stack();
show_mem();
}
+ return page;
got_pg:
+ if (kmemcheck_enabled)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
EXPORT_SYMBOL(__alloc_pages_internal);
diff --git a/mm/slab.c b/mm/slab.c
index eb96d3bba9f9..c261f03bc4c8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,6 +114,7 @@
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
+#include <linux/kmemcheck.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -179,13 +180,13 @@
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
+ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
+ SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif
/*
@@ -374,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
-/*
- * struct kmem_cache
- *
- * manages a cache.
- */
-
-struct kmem_cache {
-/* 1) per-cpu data, touched during every alloc/free */
- struct array_cache *array[NR_CPUS];
-/* 2) Cache tunables. Protected by cache_chain_mutex */
- unsigned int batchcount;
- unsigned int limit;
- unsigned int shared;
-
- unsigned int buffer_size;
- u32 reciprocal_buffer_size;
-/* 3) touched by every alloc & free from the backend */
-
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
-
-/* 4) cache_grow/shrink */
- /* order of pgs per slab (2^n) */
- unsigned int gfporder;
-
- /* force GFP flags, e.g. GFP_DMA */
- gfp_t gfpflags;
-
- size_t colour; /* cache colouring range */
- unsigned int colour_off; /* colour offset */
- struct kmem_cache *slabp_cache;
- unsigned int slab_size;
- unsigned int dflags; /* dynamic flags */
-
- /* constructor func */
- void (*ctor)(void *obj);
-
-/* 5) cache creation/removal */
- const char *name;
- struct list_head next;
-
-/* 6) statistics */
-#if STATS
- unsigned long num_active;
- unsigned long num_allocations;
- unsigned long high_mark;
- unsigned long grown;
- unsigned long reaped;
- unsigned long errors;
- unsigned long max_freeable;
- unsigned long node_allocs;
- unsigned long node_frees;
- unsigned long node_overflow;
- atomic_t allochit;
- atomic_t allocmiss;
- atomic_t freehit;
- atomic_t freemiss;
-#endif
-#if DEBUG
- /*
- * If debugging is enabled, then the allocator can add additional
- * fields and/or padding to every object. buffer_size contains the total
- * object size including these internal fields, the following two
- * variables contain the offset to the user object and its size.
- */
- int obj_offset;
- int obj_size;
-#endif
- /*
- * We put nodelists[] at the end of kmem_cache, because we want to size
- * this array to nr_node_ids slots instead of MAX_NUMNODES
- * (see kmem_cache_init())
- * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
- * is statically defined, so we reserve the max number of nodes.
- */
- struct kmem_list3 *nodelists[MAX_NUMNODES];
- /*
- * Do not add fields after nodelists[]
- */
-};
-
#define CFLGS_OFF_SLAB (0x80000000UL)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
@@ -1698,7 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ page = alloc_pages_node(nodeid, flags & ~__GFP_NOTRACK, cachep->gfporder);
if (!page)
return NULL;
@@ -1711,6 +1631,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
NR_SLAB_UNRECLAIMABLE, nr_pages);
for (i = 0; i < nr_pages; i++)
__SetPageSlab(page + i);
+
+ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
+ kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
+
+ if (cachep->ctor)
+ kmemcheck_mark_uninitialized_pages(page, nr_pages);
+ else
+ kmemcheck_mark_unallocated_pages(page, nr_pages);
+ }
+
return page_address(page);
}
@@ -1723,6 +1653,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
struct page *page = virt_to_page(addr);
const unsigned long nr_freed = i;
+ kmemcheck_free_shadow(page, cachep->gfporder);
+
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
NR_SLAB_RECLAIMABLE, nr_freed);
@@ -3395,6 +3327,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
flags);
+ if (likely(ptr))
+ kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
+
if (unlikely((flags & __GFP_ZERO) && ptr))
memset(ptr, 0, obj_size(cachep));
@@ -3453,6 +3388,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
flags);
prefetchw(objp);
+ if (likely(objp))
+ kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
+
if (unlikely((flags & __GFP_ZERO) && objp))
memset(objp, 0, obj_size(cachep));
@@ -3569,6 +3507,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ kmemcheck_slab_free(cachep, objp, obj_size(cachep));
+
/*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
diff --git a/mm/slub.c b/mm/slub.c
index 6674a7907b0c..f7c39f00d972 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
+#include <linux/kmemcheck.h>
#include <linux/fault-inject.h>
/*
@@ -147,7 +148,7 @@
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA)
+ SLAB_CACHE_DMA | SLAB_NOTRACK)
#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -1065,6 +1066,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
{
int order = oo_order(oo);
+ flags |= __GFP_NOTRACK;
+
if (node == -1)
return alloc_pages(flags, order);
else
@@ -1092,6 +1095,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
}
+
+ if (kmemcheck_enabled
+ && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
+ {
+ int pages = 1 << oo_order(oo);
+
+ kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+
+ /*
+ * Objects from caches that have a constructor don't get
+ * cleared when they're allocated, so we need to do it here.
+ */
+ if (s->ctor)
+ kmemcheck_mark_uninitialized_pages(page, pages);
+ else
+ kmemcheck_mark_unallocated_pages(page, pages);
+ }
+
page->objects = oo_objects(oo);
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1165,6 +1186,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlubDebug(page);
}
+ kmemcheck_free_shadow(page, compound_order(page));
+
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1619,6 +1642,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
memset(object, 0, objsize);
kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
+ kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
return object;
}
@@ -1751,6 +1775,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
kmemleak_free_recursive(x, s->flags);
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
+ kmemcheck_slab_free(s, object, c->objsize);
debug_check_no_locks_freed(object, c->objsize);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(object, c->objsize);
@@ -2622,7 +2647,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
if (!s || !text || !kmem_cache_open(s, flags, text,
realsize, ARCH_KMALLOC_MINALIGN,
- SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
+ SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED,
+ NULL)) {
kfree(s);
kfree(text);
goto unlock_out;
@@ -2716,9 +2742,10 @@ EXPORT_SYMBOL(__kmalloc);
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- struct page *page = alloc_pages_node(node, flags | __GFP_COMP,
- get_order(size));
+ struct page *page;
+ flags |= __GFP_COMP | __GFP_NOTRACK;
+ page = alloc_pages_node(node, flags, get_order(size));
if (page)
return page_address(page);
else
@@ -4393,6 +4420,8 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'a';
if (s->flags & SLAB_DEBUG_FREE)
*p++ = 'F';
+ if (!(s->flags & SLAB_NOTRACK))
+ *p++ = 't';
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1a94a3037370..5c93435b0347 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
@@ -201,6 +202,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->data = data;
skb_reset_tail_pointer(skb);
skb->end = skb->tail + size;
+ kmemcheck_annotate_bitfield(skb, flags1);
+ kmemcheck_annotate_bitfield(skb, flags2);
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -217,6 +220,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
+ kmemcheck_annotate_bitfield(child, flags1);
+ kmemcheck_annotate_bitfield(child, flags2);
skb->fclone = SKB_FCLONE_ORIG;
atomic_set(fclone_ref, 1);
@@ -635,6 +640,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
+
+ kmemcheck_annotate_bitfield(n, flags1);
+ kmemcheck_annotate_bitfield(n, flags2);
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 04e35eb2e736..3ead2253a363 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -945,6 +945,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
sk = kmalloc(prot->obj_size, priority);
if (sk != NULL) {
+ kmemcheck_annotate_bitfield(sk, flags);
+
if (security_sk_alloc(sk, family, priority))
goto out_free;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 68a8d892c711..61283f928825 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,6 +9,7 @@
*/
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
@@ -120,6 +121,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
if (tw != NULL) {
const struct inet_sock *inet = inet_sk(sk);
+ kmemcheck_annotate_bitfield(tw, flags);
+
/* Give us an identity. */
tw->tw_daddr = inet->daddr;
tw->tw_rcv_saddr = inet->rcv_saddr;