From 948f984df52511bb0efa5c026813b0c34de43aa0 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 20 May 2008 14:39:25 +0300
Subject: core, x86: make LIST_POISON less deadly

The list macros use LIST_POISON1 and LIST_POISON2 as undereferenceable
pointers in order to trap erroneous use of freed list_heads.
Unfortunately userspace can arrange for those pointers to actually be
dereferenceable, potentially turning an oops into an exploit.

To avoid this, allow architectures (currently x86_64 only) to override
the default values for these pointers with truly undereferenceable
values. This is easy on x86_64 as the virtual address space is large
and contains permanently unmapped ranges.

Other 64-bit architectures will likely find similar unmapped ranges.

Signed-off-by: Avi Kivity
Signed-off-by: Ingo Molnar
---
 arch/x86/Kconfig       |  5 +++++
 include/linux/poison.h | 10 ++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..e21cc33b794a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1021,6 +1021,11 @@ config ARCH_MEMORY_PROBE
 	def_bool X86_64
 	depends on MEMORY_HOTPLUG
 
+config ILLEGAL_POINTER_VALUE
+	hex
+	default 0 if X86_32
+	default 0xffffc10000000000 if X86_64
+
 source "mm/Kconfig"
 
 config HIGHPTE
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 9f31683728fd..0d105a56e668 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -1,14 +1,20 @@
 #ifndef _LINUX_POISON_H
 #define _LINUX_POISON_H
 
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+#define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+#else
+#define POISON_POINTER_DELTA 0L
+#endif
+
 /********** include/linux/list.h **********/
 /*
  * These are non-NULL pointers that will result in page faults
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
+#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
--
cgit v1.2.3


From d23db9bbd91baaa1b22e7839e1e932bc6ae0582b Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 14 Jul 2008 18:03:27 +0200
Subject: x86: change LIST_POISON to 0xdead000000000000

Linus observed:

| > +	default 0xffffc10000000000 if X86_64
|
| This looks like a singularly bad pointer value on x86-64.
|
| Why not pick something that is *guaranteed* to fault? The above looks like
| any future setup that supports 41 bits of addressing and has extended the
| page tables (yes, it will happen eventually) will find that to be a
| perfectly valid address?
|
| It's also visually confusing, since it's visually very close to a real
| kernel pointer too.

Correct - so change the poison value to 0xdead000000000000.
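For illustration, a minimal user-space sketch (not part of either patch) of the values the list poisons take once CONFIG_ILLEGAL_POINTER_VALUE is 0xdead000000000000; both results land in the x86-64 non-canonical hole, so any dereference faults regardless of what user space manages to map:

#include <stdio.h>

/* Sketch only: mirrors LIST_POISON1/LIST_POISON2 after this series */
#define POISON_POINTER_DELTA 0xdead000000000000UL

int main(void)
{
	void *list_poison1 = (void *) (0x00100100UL + POISON_POINTER_DELTA);
	void *list_poison2 = (void *) (0x00200200UL + POISON_POINTER_DELTA);

	/* Prints 0xdead000000100100 and 0xdead000000200200 */
	printf("%p %p\n", list_poison1, list_poison2);
	return 0;
}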
Acked-by: Linus Torvalds
Signed-off-by: Ingo Molnar
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f09b3e5d808d..0dbd040bd654 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1022,7 +1022,7 @@ config ARCH_MEMORY_PROBE
 config ILLEGAL_POINTER_VALUE
 	hex
 	default 0 if X86_32
-	default 0xffffc10000000000 if X86_64
+	default 0xdead000000000000 if X86_64
 
 source "mm/Kconfig"
--
cgit v1.2.3


From d31ec1c47b10e268794e7b739b2009ac6a19eb8b Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 22 Jul 2008 09:40:43 +0200
Subject: poison-pointers: clean up, add comments

- remove stray 'L' (noticed by Andrew Morton)

- add more comments about the purpose of this change

- fix spelling (noticed by Andrew Morton)

Signed-off-by: Ingo Molnar
---
 include/linux/poison.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 0d105a56e668..603d3bda31d9 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -1,13 +1,19 @@
 #ifndef _LINUX_POISON_H
 #define _LINUX_POISON_H
 
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
 #ifdef CONFIG_ILLEGAL_POINTER_VALUE
-#define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
 #else
-#define POISON_POINTER_DELTA 0L
+# define POISON_POINTER_DELTA 0
 #endif
 
-/********** include/linux/list.h **********/
 /*
  * These are non-NULL pointers that will result in page faults
  * under normal circumstances, used to verify that nobody uses
--
cgit v1.2.3


From 36b21230589ee51763afcc9ba2d126973f3ae96c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput
Date: Sat, 27 Dec 2008 15:21:36 +0530
Subject: x86: eliminate at least 10684 sparse warnings

Impact: cleanup, avoid 10000+ sparse warnings

Eliminate 1434 sparse warnings like:
 include/linux/list.h:106:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
 include/linux/list.h:107:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
 include/linux/list.h:579:12: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
 include/linux/list.h:580:13: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
 include/linux/rculist.h:97:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
 include/linux/rculist.h:143:14: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
 include/linux/rculist.h:260:13: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
 include/linux/rculist.h:280:15: warning: constant 0xdead000000000000 is so big it is unsigned long

Signed-off-by: Jaswinder Singh Rajput
Signed-off-by: Ingo Molnar
---
 include/linux/poison.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 603d3bda31d9..f2de9a999f4a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -9,7 +9,7 @@
  * that is also not mappable by user-space exploits:
  */
 #ifdef CONFIG_ILLEGAL_POINTER_VALUE
-# define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
 #else
 # define POISON_POINTER_DELTA 0
 #endif
--
cgit v1.2.3


From 6ce51c431019310ca03371355a4366c4649fa349 Mon Sep 17 00:00:00 2001
From: Luis Henriques
Date: Wed, 1 Apr 2009 18:06:35 +0100
Subject: genirq: do not execute DEBUG_SHIRQ when irq setup failed

When requesting an IRQ, the DEBUG_SHIRQ code executes a fake IRQ just
to make sure the driver is ready to receive an IRQ immediately. The
problem was that this fake IRQ was being executed even if the
interrupt line failed to be allocated by __setup_irq.

Signed-off-by: Luis Henriques
LKML-Reference: <20090401170635.GA4392@hades.domain.com>
Signed-off-by: Thomas Gleixner
[ fixed bug pointed out by a warning reported by Stephen Rothwell ]
Cc: Stephen Rothwell
Signed-off-by: Ingo Molnar
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355c..8c68d5b95d48 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -768,7 +768,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		kfree(action);
 
 #ifdef CONFIG_DEBUG_SHIRQ
-	if (irqflags & IRQF_SHARED) {
+	if (!retval && (irqflags & IRQF_SHARED)) {
 		/*
 		 * It's a shared IRQ -- the driver ought to be prepared for it
 		 * to happen immediately, so let's make sure....
--
cgit v1.2.3


From b332828c39326b1dca617f387dd15d12e81cd5f0 Mon Sep 17 00:00:00 2001
From: "K.Prasad"
Date: Mon, 1 Jun 2009 23:43:10 +0530
Subject: hw-breakpoints: prepare the code for Hardware Breakpoint interfaces

The generic hardware breakpoint interface provides an abstraction of
hardware breakpoints in front of specific arch implementations for both
kernel and user side breakpoints. This includes execution breakpoints
and read/write breakpoints, also known as "watchpoints".

This patch introduces header files containing constants, structure
definitions and declarations of functions used by the hardware
breakpoint core and the x86-specific code. It also introduces
array-based storage for the debug-register values in 'struct
thread_struct', while modifying all users of the debugreg members in
the structure.
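As a rough sketch (abridged, using the names this patch introduces), the storage change in 'struct thread_struct' amounts to:

/* Before: four discrete debug-address fields */
unsigned long debugreg0;
unsigned long debugreg1;
unsigned long debugreg2;
unsigned long debugreg3;

/* After: array-based storage that callers can index and loop over */
unsigned long debugreg[HBP_NUM];	/* HBP_NUM == 4: DR0..DR3 */
unsigned long debugreg6;		/* DR6, debug status */
unsigned long debugreg7;		/* DR7, debug control */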
[ Impact: add headers for new hardware breakpoint interface ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/a.out-core.h | 8 +- arch/x86/include/asm/debugreg.h | 29 ++++++++ arch/x86/include/asm/hw_breakpoint.h | 55 ++++++++++++++ arch/x86/include/asm/processor.h | 8 +- arch/x86/kernel/process.c | 16 ++-- arch/x86/kernel/ptrace.c | 16 ++-- arch/x86/power/cpu_32.c | 8 +- arch/x86/power/cpu_64.c | 8 +- include/asm-generic/hw_breakpoint.h | 139 +++++++++++++++++++++++++++++++++++ 9 files changed, 255 insertions(+), 32 deletions(-) create mode 100644 arch/x86/include/asm/hw_breakpoint.h create mode 100644 include/asm-generic/hw_breakpoint.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index bb70e397aa84..fc4685dd6e4d 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -32,10 +32,10 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[0] = current->thread.debugreg[0]; + dump->u_debugreg[1] = current->thread.debugreg[1]; + dump->u_debugreg[2] = current->thread.debugreg[2]; + dump->u_debugreg[3] = current->thread.debugreg[3]; dump->u_debugreg[4] = 0; dump->u_debugreg[5] = 0; dump->u_debugreg[6] = current->thread.debugreg6; diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 3ea6f37be9e2..23439fbb1d0e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,6 +18,7 @@ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ @@ -49,6 +50,8 @@ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ @@ -67,4 +70,30 @@ #define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ #define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ +/* + * HW breakpoint additions + */ +#ifdef __KERNEL__ + +/* For process management */ +extern void flush_thread_hw_breakpoint(struct task_struct *tsk); +extern int copy_thread_hw_breakpoint(struct task_struct *tsk, + struct task_struct *child, unsigned long clone_flags); + +/* For CPU management */ +extern void load_debug_registers(void); +static inline void hw_breakpoint_disable(void) +{ + /* Zero the control register for HW Breakpoint */ + set_debugreg(0UL, 7); + + /* Zero-out the individual HW breakpoint address registers */ + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); +} + +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h new file mode 100644 index 000000000000..1acb4d45de70 --- /dev/null +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -0,0 
+1,55 @@ +#ifndef _I386_HW_BREAKPOINT_H +#define _I386_HW_BREAKPOINT_H + +#ifdef __KERNEL__ +#define __ARCH_HW_BREAKPOINT_H + +struct arch_hw_breakpoint { + char *name; /* Contains name of the symbol to set bkpt */ + unsigned long address; + u8 len; + u8 type; +}; + +#include +#include + +/* Available HW breakpoint length encodings */ +#define HW_BREAKPOINT_LEN_1 0x40 +#define HW_BREAKPOINT_LEN_2 0x44 +#define HW_BREAKPOINT_LEN_4 0x4c +#define HW_BREAKPOINT_LEN_EXECUTE 0x40 + +#ifdef CONFIG_X86_64 +#define HW_BREAKPOINT_LEN_8 0x48 +#endif + +/* Available HW breakpoint type encodings */ + +/* trigger on instruction execute */ +#define HW_BREAKPOINT_EXECUTE 0x80 +/* trigger on memory write */ +#define HW_BREAKPOINT_WRITE 0x81 +/* trigger on memory read or write */ +#define HW_BREAKPOINT_RW 0x83 + +/* Total number of available HW breakpoint registers */ +#define HBP_NUM 4 + +extern struct hw_breakpoint *hbp_kernel[HBP_NUM]; +DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); +extern unsigned int hbp_user_refcount[HBP_NUM]; + +extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk); +extern void arch_uninstall_thread_hw_breakpoint(void); +extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); +extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk); +extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk); +extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk); +extern void arch_update_kernel_hw_breakpoint(void *); +extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data); +#endif /* __KERNEL__ */ +#endif /* _I386_HW_BREAKPOINT_H */ + diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b2fab0051e0..448b34a8e393 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct mm_struct; #include #include +#define HBP_NUM 4 /* * Default implementation of macro that returns current * instruction pointer ("program counter"). 
@@ -431,12 +432,11 @@ struct thread_struct { unsigned long fs; unsigned long gs; /* Hardware debugging registers: */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; + unsigned long debugreg[HBP_NUM]; unsigned long debugreg6; unsigned long debugreg7; + /* Hardware breakpoint info */ + struct hw_breakpoint *hbp[HBP_NUM]; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fb5dfb891f0f..291527cb438a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -106,10 +106,10 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; + tsk->thread.debugreg[0] = 0; + tsk->thread.debugreg[1] = 0; + tsk->thread.debugreg[2] = 0; + tsk->thread.debugreg[3] = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); @@ -194,10 +194,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); /* no 4 and 5 */ set_debugreg(next->debugreg6, 6); set_debugreg(next->debugreg7, 7); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c13..313be40be55a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -471,10 +471,10 @@ static int genregs_set(struct task_struct *target, static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) { switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; + case 0: return child->thread.debugreg[0]; + case 1: return child->thread.debugreg[1]; + case 2: return child->thread.debugreg[2]; + case 3: return child->thread.debugreg[3]; case 6: return child->thread.debugreg6; case 7: return child->thread.debugreg7; } @@ -493,10 +493,10 @@ static int ptrace_set_debugreg(struct task_struct *child, return -EIO; switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + case 0: child->thread.debugreg[0] = data; break; + case 1: child->thread.debugreg[1] = data; break; + case 2: child->thread.debugreg[2] = data; break; + case 3: child->thread.debugreg[3] = data; break; case 6: if ((data & ~0xffffffffUL) != 0) diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index ce702c5b3a2c..519913948003 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -84,10 +84,10 @@ static void fix_processor_context(void) * Now maybe reload the debug registers */ if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + 
set_debugreg(current->thread.debugreg[3], 3); /* no 4 and 5 */ set_debugreg(current->thread.debugreg6, 6); set_debugreg(current->thread.debugreg7, 7); diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 5343540f2607..1e3bdcc959ff 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -163,10 +163,10 @@ static void fix_processor_context(void) * Now maybe reload the debug registers */ if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); + set_debugreg(current->thread.debugreg[0], 0); + set_debugreg(current->thread.debugreg[1], 1); + set_debugreg(current->thread.debugreg[2], 2); + set_debugreg(current->thread.debugreg[3], 3); /* no 4 and 5 */ loaddebug(¤t->thread, 6); loaddebug(¤t->thread, 7); diff --git a/include/asm-generic/hw_breakpoint.h b/include/asm-generic/hw_breakpoint.h new file mode 100644 index 000000000000..9bf2d12eb74a --- /dev/null +++ b/include/asm-generic/hw_breakpoint.h @@ -0,0 +1,139 @@ +#ifndef _ASM_GENERIC_HW_BREAKPOINT_H +#define _ASM_GENERIC_HW_BREAKPOINT_H + +#ifndef __ARCH_HW_BREAKPOINT_H +#error "Please don't include this file directly" +#endif + +#ifdef __KERNEL__ +#include +#include +#include + +/** + * struct hw_breakpoint - unified kernel/user-space hardware breakpoint + * @triggered: callback invoked after target address access + * @info: arch-specific breakpoint info (address, length, and type) + * + * %hw_breakpoint structures are the kernel's way of representing + * hardware breakpoints. These are data breakpoints + * (also known as "watchpoints", triggered on data access), and the breakpoint's + * target address can be located in either kernel space or user space. + * + * The breakpoint's address, length, and type are highly + * architecture-specific. The values are encoded in the @info field; you + * specify them when registering the breakpoint. To examine the encoded + * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared + * below. + * + * The address is specified as a regular kernel pointer (for kernel-space + * breakponts) or as an %__user pointer (for user-space breakpoints). + * With register_user_hw_breakpoint(), the address must refer to a + * location in user space. The breakpoint will be active only while the + * requested task is running. Conversely with + * register_kernel_hw_breakpoint(), the address must refer to a location + * in kernel space, and the breakpoint will be active on all CPUs + * regardless of the current task. + * + * The length is the breakpoint's extent in bytes, which is subject to + * certain limitations. include/asm/hw_breakpoint.h contains macros + * defining the available lengths for a specific architecture. Note that + * the address's alignment must match the length. The breakpoint will + * catch accesses to any byte in the range from address to address + + * (length - 1). + * + * The breakpoint's type indicates the sort of access that will cause it + * to trigger. Possible values may include: + * + * %HW_BREAKPOINT_RW (triggered on read or write access), + * %HW_BREAKPOINT_WRITE (triggered on write access), and + * %HW_BREAKPOINT_READ (triggered on read access). + * + * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all + * possibilities are available on all architectures. Execute breakpoints + * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE. 
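+ *
+ * (Illustration, not part of the original text: because the address's
+ * alignment must match the length, a %HW_BREAKPOINT_LEN_4 breakpoint
+ * at address 0x1003 is invalid (0x1003 & 3 != 0), while one at 0x1004
+ * watches bytes 0x1004 through 0x1007.)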
+ * + * When a breakpoint gets hit, the @triggered callback is + * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the + * processor registers. + * Data breakpoints occur after the memory access has taken place. + * Breakpoints are disabled during execution @triggered, to avoid + * recursive traps and allow unhindered access to breakpointed memory. + * + * This sample code sets a breakpoint on pid_max and registers a callback + * function for writes to that variable. Note that it is not portable + * as written, because not all architectures support HW_BREAKPOINT_LEN_4. + * + * ---------------------------------------------------------------------- + * + * #include + * + * struct hw_breakpoint my_bp; + * + * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) + * { + * printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n"); + * dump_stack(); + * ............... + * } + * + * static struct hw_breakpoint my_bp; + * + * static int init_module(void) + * { + * ...................... + * my_bp.info.type = HW_BREAKPOINT_WRITE; + * my_bp.info.len = HW_BREAKPOINT_LEN_4; + * + * my_bp.installed = (void *)my_bp_installed; + * + * rc = register_kernel_hw_breakpoint(&my_bp); + * ...................... + * } + * + * static void cleanup_module(void) + * { + * ...................... + * unregister_kernel_hw_breakpoint(&my_bp); + * ...................... + * } + * + * ---------------------------------------------------------------------- + */ +struct hw_breakpoint { + void (*triggered)(struct hw_breakpoint *, struct pt_regs *); + struct arch_hw_breakpoint info; +}; + +/* + * len and type values are defined in include/asm/hw_breakpoint.h. + * Available values vary according to the architecture. On i386 the + * possibilities are: + * + * HW_BREAKPOINT_LEN_1 + * HW_BREAKPOINT_LEN_2 + * HW_BREAKPOINT_LEN_4 + * HW_BREAKPOINT_RW + * HW_BREAKPOINT_READ + * + * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the + * 1-, 2-, and 4-byte lengths may be unavailable. There also may be + * HW_BREAKPOINT_WRITE. You can use #ifdef to check at compile time. + */ + +extern int register_user_hw_breakpoint(struct task_struct *tsk, + struct hw_breakpoint *bp); +extern int modify_user_hw_breakpoint(struct task_struct *tsk, + struct hw_breakpoint *bp); +extern void unregister_user_hw_breakpoint(struct task_struct *tsk, + struct hw_breakpoint *bp); +/* + * Kernel breakpoints are not associated with any particular thread. + */ +extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp); +extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp); + +extern unsigned int hbp_kernel_pos; + +#endif /* __KERNEL__ */ +#endif /* _ASM_GENERIC_HW_BREAKPOINT_H */ -- cgit v1.2.3 From 62a038d34db26771756cf3689e36de638bedd2c4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:33 +0530 Subject: hw-breakpoints: introducing generic hardware breakpoint handler interfaces This patch introduces the generic Hardware Breakpoint interfaces for both user and kernel space requests. This core Api handles the hardware breakpoints through new helpers. It handles the user-space breakpoints and kernel breakpoints in front of arch implementation. One can choose kernel wide breakpoints using the following helpers and passing them a generic struct hw_breakpoint: - register_kernel_hw_breakpoint() - unregister_kernel_hw_breakpoint() - modify_kernel_hw_breakpoint() On the other side, you can choose per task breakpoints. 
- register_user_hw_breakpoint() - unregister_user_hw_breakpoint() - modify_user_hw_breakpoint() [ fweisbec@gmail.com: fix conflict against perfcounter ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/Kconfig | 4 + kernel/Makefile | 1 + kernel/hw_breakpoint.c | 378 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 kernel/hw_breakpoint.c diff --git a/arch/Kconfig b/arch/Kconfig index 78a35e9dc104..1adf2d0e6356 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -112,3 +112,7 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool + +config HAVE_HW_BREAKPOINT + bool + diff --git a/kernel/Makefile b/kernel/Makefile index a35eee3436de..18ad1110b226 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..c1f64e65a9f3 --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,378 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_X86 +#include +#endif +/* + * Spinlock that protects all (un)register operations over kernel/user-space + * breakpoint requests + */ +static DEFINE_SPINLOCK(hw_breakpoint_lock); + +/* Array of kernel-space breakpoint structures */ +struct hw_breakpoint *hbp_kernel[HBP_NUM]; + +/* + * Per-processor copy of hbp_kernel[]. Used only when hbp_kernel is being + * modified but we need the older copy to handle any hbp exceptions. It will + * sync with hbp_kernel[] value after updation is done through IPIs. + */ +DEFINE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); + +/* + * Kernel breakpoints grow downwards, starting from HBP_NUM + * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for + * kernel-space request. We will initialise it here and not in an __init + * routine because load_debug_registers(), which uses this variable can be + * called very early during CPU initialisation. 
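+ *
+ * (Illustration, not in the original comment: with HBP_NUM == 4 and two
+ * kernel breakpoints registered, hbp_kernel_pos == 2; hbp_kernel[2] and
+ * hbp_kernel[3] hold the kernel breakpoints, while debug registers 0
+ * and 1 remain available for user-space requests.)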
+ */ +unsigned int hbp_kernel_pos = HBP_NUM; + +/* + * An array containing refcount of threads using a given bkpt register + * Accesses are synchronised by acquiring hw_breakpoint_lock + */ +unsigned int hbp_user_refcount[HBP_NUM]; + +/* + * Load the debug registers during startup of a CPU. + */ +void load_debug_registers(void) +{ + unsigned long flags; + struct task_struct *tsk = current; + + spin_lock_bh(&hw_breakpoint_lock); + + /* Prevent IPIs for new kernel breakpoint updates */ + local_irq_save(flags); + arch_update_kernel_hw_breakpoint(NULL); + local_irq_restore(flags); + + if (test_tsk_thread_flag(tsk, TIF_DEBUG)) + arch_install_thread_hw_breakpoint(tsk); + + spin_unlock_bh(&hw_breakpoint_lock); +} + +/* + * Erase all the hardware breakpoint info associated with a thread. + * + * If tsk != current then tsk must not be usable (for example, a + * child being cleaned up from a failed fork). + */ +void flush_thread_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *thread = &(tsk->thread); + + spin_lock_bh(&hw_breakpoint_lock); + + /* The thread no longer has any breakpoints associated with it */ + clear_tsk_thread_flag(tsk, TIF_DEBUG); + for (i = 0; i < HBP_NUM; i++) { + if (thread->hbp[i]) { + hbp_user_refcount[i]--; + kfree(thread->hbp[i]); + thread->hbp[i] = NULL; + } + } + + arch_flush_thread_hw_breakpoint(tsk); + + /* Actually uninstall the breakpoints if necessary */ + if (tsk == current) + arch_uninstall_thread_hw_breakpoint(); + spin_unlock_bh(&hw_breakpoint_lock); +} + +/* + * Copy the hardware breakpoint info from a thread to its cloned child. + */ +int copy_thread_hw_breakpoint(struct task_struct *tsk, + struct task_struct *child, unsigned long clone_flags) +{ + /* + * We will assume that breakpoint settings are not inherited + * and the child starts out with no debug registers set. + * But what about CLONE_PTRACE? + */ + clear_tsk_thread_flag(child, TIF_DEBUG); + + /* We will call flush routine since the debugregs are not inherited */ + arch_flush_thread_hw_breakpoint(child); + + return 0; +} + +static int __register_user_hw_breakpoint(int pos, struct task_struct *tsk, + struct hw_breakpoint *bp) +{ + struct thread_struct *thread = &(tsk->thread); + int rc; + + /* Do not overcommit. Fail if kernel has used the hbp registers */ + if (pos >= hbp_kernel_pos) + return -ENOSPC; + + rc = arch_validate_hwbkpt_settings(bp, tsk); + if (rc) + return rc; + + thread->hbp[pos] = bp; + hbp_user_refcount[pos]++; + + arch_update_user_hw_breakpoint(pos, tsk); + /* + * Does it need to be installed right now? 
+ * Otherwise it will get installed the next time tsk runs + */ + if (tsk == current) + arch_install_thread_hw_breakpoint(tsk); + + return rc; +} + +/* + * Modify the address of a hbp register already in use by the task + * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint() + */ +static int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk, + struct hw_breakpoint *bp) +{ + struct thread_struct *thread = &(tsk->thread); + + if ((pos >= hbp_kernel_pos) || (arch_validate_hwbkpt_settings(bp, tsk))) + return -EINVAL; + + if (thread->hbp[pos] == NULL) + return -EINVAL; + + thread->hbp[pos] = bp; + /* + * 'pos' must be that of a hbp register already used by 'tsk' + * Otherwise arch_modify_user_hw_breakpoint() will fail + */ + arch_update_user_hw_breakpoint(pos, tsk); + + if (tsk == current) + arch_install_thread_hw_breakpoint(tsk); + + return 0; +} + +static void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk) +{ + hbp_user_refcount[pos]--; + tsk->thread.hbp[pos] = NULL; + + arch_update_user_hw_breakpoint(pos, tsk); + + if (tsk == current) + arch_install_thread_hw_breakpoint(tsk); +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @tsk: pointer to 'task_struct' of the process to which the address belongs + * @bp: the breakpoint structure to register + * + * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and + * @bp->triggered must be set properly before invocation + * + */ +int register_user_hw_breakpoint(struct task_struct *tsk, + struct hw_breakpoint *bp) +{ + struct thread_struct *thread = &(tsk->thread); + int i, rc = -ENOSPC; + + spin_lock_bh(&hw_breakpoint_lock); + + for (i = 0; i < hbp_kernel_pos; i++) { + if (!thread->hbp[i]) { + rc = __register_user_hw_breakpoint(i, tsk, bp); + break; + } + } + if (!rc) + set_tsk_thread_flag(tsk, TIF_DEBUG); + + spin_unlock_bh(&hw_breakpoint_lock); + return rc; +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + * @bp: the breakpoint structure to unregister + * + */ +int modify_user_hw_breakpoint(struct task_struct *tsk, struct hw_breakpoint *bp) +{ + struct thread_struct *thread = &(tsk->thread); + int i, ret = -ENOENT; + + spin_lock_bh(&hw_breakpoint_lock); + for (i = 0; i < hbp_kernel_pos; i++) { + if (bp == thread->hbp[i]) { + ret = __modify_user_hw_breakpoint(i, tsk, bp); + break; + } + } + spin_unlock_bh(&hw_breakpoint_lock); + return ret; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_user_hw_breakpoint - unregister a user-space hardware breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + * @bp: the breakpoint structure to unregister + * + */ +void unregister_user_hw_breakpoint(struct task_struct *tsk, + struct hw_breakpoint *bp) +{ + struct thread_struct *thread = &(tsk->thread); + int i, pos = -1, hbp_counter = 0; + + spin_lock_bh(&hw_breakpoint_lock); + for (i = 0; i < hbp_kernel_pos; i++) { + if (thread->hbp[i]) + hbp_counter++; + if (bp == thread->hbp[i]) + pos = i; + } + if (pos >= 0) { + __unregister_user_hw_breakpoint(pos, tsk); + hbp_counter--; + } + if (!hbp_counter) + clear_tsk_thread_flag(tsk, TIF_DEBUG); + + spin_unlock_bh(&hw_breakpoint_lock); +} +EXPORT_SYMBOL_GPL(unregister_user_hw_breakpoint); + +/** + * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space + * @bp: the 
breakpoint structure to register + * + * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and + * @bp->triggered must be set properly before invocation + * + */ +int register_kernel_hw_breakpoint(struct hw_breakpoint *bp) +{ + int rc; + + rc = arch_validate_hwbkpt_settings(bp, NULL); + if (rc) + return rc; + + spin_lock_bh(&hw_breakpoint_lock); + + rc = -ENOSPC; + /* Check if we are over-committing */ + if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) { + hbp_kernel_pos--; + hbp_kernel[hbp_kernel_pos] = bp; + on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1); + rc = 0; + } + + spin_unlock_bh(&hw_breakpoint_lock); + return rc; +} +EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint); + +/** + * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space + * @bp: the breakpoint structure to unregister + * + * Uninstalls and unregisters @bp. + */ +void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp) +{ + int i, j; + + spin_lock_bh(&hw_breakpoint_lock); + + /* Find the 'bp' in our list of breakpoints for kernel */ + for (i = hbp_kernel_pos; i < HBP_NUM; i++) + if (bp == hbp_kernel[i]) + break; + + /* Check if we did not find a match for 'bp'. If so return early */ + if (i == HBP_NUM) { + spin_unlock_bh(&hw_breakpoint_lock); + return; + } + + /* + * We'll shift the breakpoints one-level above to compact if + * unregistration creates a hole + */ + for (j = i; j > hbp_kernel_pos; j--) + hbp_kernel[j] = hbp_kernel[j-1]; + + hbp_kernel[hbp_kernel_pos] = NULL; + on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1); + hbp_kernel_pos++; + + spin_unlock_bh(&hw_breakpoint_lock); +} +EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} + +core_initcall(init_hw_breakpoint); -- cgit v1.2.3 From 0067f1297241ea567f2b22a455519752d70fcca9 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:57 +0530 Subject: hw-breakpoints: x86 architecture implementation of Hardware Breakpoint interfaces This patch introduces the arch-specific implementation of the generic hardware breakpoints in kernel/hw_breakpoint.c inside x86 specific directories. It contains functions which help to validate and serve requests using Hardware Breakpoint registers on x86 processors. 
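For orientation, a minimal sketch of how a client would use the interface these routines back, mirroring the sample embedded in include/asm-generic/hw_breakpoint.h (error handling and module boilerplate omitted; "pid_max" is just an example symbol):

#include <asm/hw_breakpoint.h>

static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
{
	printk(KERN_DEBUG "write to %s caught\n", bp->info.name);
}

static struct hw_breakpoint my_bp = {
	.triggered = my_triggered,
	/* arch_store_info() resolves .name to an address via kallsyms */
	.info = {
		.name = "pid_max",
		.type = HW_BREAKPOINT_WRITE,
		.len  = HW_BREAKPOINT_LEN_4,
	},
};

/* module init: rc = register_kernel_hw_breakpoint(&my_bp); */
/* module exit: unregister_kernel_hw_breakpoint(&my_bp);    */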
[ fweisbec@gmail.com: fix conflict against kmemcheck ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/Kconfig | 1 + arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/hw_breakpoint.c | 382 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/hw_breakpoint.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885eee14..3033375ed6bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HW_BREAKPOINT config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77df4d654ff9..cbc781829173 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..4867c9f3b5fb --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,382 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Unmasked kernel DR7 value */ +static unsigned long kdr7; + +/* + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. + * Used to clear and verify the status of bits corresponding to DR0 - DR3 + */ +static const unsigned long dr7_masks[HBP_NUM] = { + 0x000f0003, /* LEN0, R/W0, G0, L0 */ + 0x00f0000c, /* LEN1, R/W1, G1, L1 */ + 0x0f000030, /* LEN2, R/W2, G2, L2 */ + 0xf00000c0 /* LEN3, R/W3, G3, L3 */ +}; + + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. 
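+ *
+ * (Worked example, not in the original comment: for drnum == 1 and a
+ * 4-byte write breakpoint, (len | type) & 0xf == (0x4c | 0x81) & 0xf
+ * == 0xd; shifted left by DR_CONTROL_SHIFT + 1 * DR_CONTROL_SIZE == 20
+ * and OR-ed with DR_GLOBAL_ENABLE << (1 * DR_ENABLE_SIZE) == 0x8 and
+ * DR_GLOBAL_SLOWDOWN == 0x200, this yields 0x00d00208.)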
+ */ +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | + DR_GLOBAL_SLOWDOWN; + return bp_info; +} + +void arch_update_kernel_hw_breakpoint(void *unused) +{ + struct hw_breakpoint *bp; + int i, cpu = get_cpu(); + unsigned long temp_kdr7 = 0; + + /* Don't allow debug exceptions while we update the registers */ + set_debugreg(0UL, 7); + + for (i = hbp_kernel_pos; i < HBP_NUM; i++) { + per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; + if (bp) { + temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); + set_debugreg(bp->info.address, i); + } + } + + /* No need to set DR6. Update the debug registers with kernel-space + * breakpoint values from kdr7 and user-space requests from the + * current process + */ + kdr7 = temp_kdr7; + set_debugreg(kdr7 | current->thread.debugreg7, 7); + put_cpu_no_resched(); +} + +/* + * Install the thread breakpoints in their debug registers. + */ +void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + + switch (hbp_kernel_pos) { + case 4: + set_debugreg(thread->debugreg[3], 3); + case 3: + set_debugreg(thread->debugreg[2], 2); + case 2: + set_debugreg(thread->debugreg[1], 1); + case 1: + set_debugreg(thread->debugreg[0], 0); + default: + break; + } + + /* No need to set DR6 */ + set_debugreg((kdr7 | thread->debugreg7), 7); +} + +/* + * Install the debug register values for just the kernel, no thread. + */ +void arch_uninstall_thread_hw_breakpoint() +{ + /* Clear the user-space portion of debugreg7 by setting only kdr7 */ + set_debugreg(kdr7, 7); + +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case HW_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case HW_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case HW_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +{ + /* + * User-space requests will always have the address field populated + * Symbol names from user-space are rejected + */ + if (tsk && bp->info.name) + return -EINVAL; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. 
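+ *
+ * (Example, not in the original comment: a kernel-space caller may set
+ * info.name = "pid_max" and leave info.address zero; the lookup below
+ * then fills in the address via kallsyms_lookup_name().)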
+ */ + if (bp->info.name) + bp->info.address = (unsigned long) + kallsyms_lookup_name(bp->info.name); + if (bp->info.address) + return 0; + return -EINVAL; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk) +{ + unsigned int align; + int ret = -EINVAL; + + switch (bp->info.type) { + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + case HW_BREAKPOINT_EXECUTE: + if ((!arch_check_va_in_userspace(bp->info.address, + bp->info.len)) && + bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) + return ret; + break; + case HW_BREAKPOINT_WRITE: + break; + case HW_BREAKPOINT_RW: + break; + default: + return ret; + } + + switch (bp->info.len) { + case HW_BREAKPOINT_LEN_1: + align = 0; + break; + case HW_BREAKPOINT_LEN_2: + align = 1; + break; + case HW_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->triggered) + ret = arch_store_info(bp, tsk); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (bp->info.address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(bp->info.address, + bp->info.len)) + return -EFAULT; + } + return 0; +} + +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + struct hw_breakpoint *bp = thread->hbp[pos]; + + thread->debugreg7 &= ~dr7_masks[pos]; + if (bp) { + thread->debugreg[pos] = bp->info.address; + thread->debugreg7 |= encode_dr7(pos, bp->info.len, + bp->info.type); + } else + thread->debugreg[pos] = 0; +} + +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *thread = &(tsk->thread); + + thread->debugreg7 = 0; + for (i = 0; i < HBP_NUM; i++) + thread->debugreg[i] = 0; +} + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct hw_breakpoint *bp; + /* The DR6 value is stored in args->err */ + unsigned long dr7, dr6 = args->err; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Lazy debug register switching */ + if (!test_tsk_thread_flag(current, TIF_DEBUG)) + arch_uninstall_thread_hw_breakpoint(); + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. 
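+ *
+ * (For reference, not in the original comment: DR_TRAP_BITS is
+ * DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3 == 0xf, one "hit" bit in DR6 per
+ * debug address register.)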
+ */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + /* + * Find the corresponding hw_breakpoint structure and + * invoke its triggered callback. + */ + if (i >= hbp_kernel_pos) + bp = per_cpu(this_hbp_kernel[i], cpu); + else { + bp = current->thread.hbp[i]; + if (bp) + rc = NOTIFY_DONE; + } + /* + * bp can be NULL due to lazy debug register switching + * or due to the delay between updates of hbp_kernel_pos + * and this_hbp_kernel. + */ + if (!bp) + continue; + + (bp->triggered)(bp, args->regs); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu_no_resched(); + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} -- cgit v1.2.3 From 08d68323d1f0c34452e614263b212ca556dae47f Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:08 +0530 Subject: hw-breakpoints: modifying generic debug exception to use thread-specific debug registers This patch modifies the breakpoint exception handler code to use the new abstract debug register names. [ fweisbec@gmail.com: fix conflict against kmemcheck ] [ Impact: refactor and cleanup x86 debug exception handler ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 69 +++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..de9913247dd0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,73 +529,52 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. 
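+ *
+ * (For reference, not in the original comment: DR_STEP is bit 14
+ * (0x4000) of DR6, set by the processor on a single-step trap.)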
+ * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } -- cgit v1.2.3 From 1e3500666f7c5daaadadb8431a2927cdbbdb7dd4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:26 +0530 Subject: hw-breakpoints: use wrapper routines around debug registers in processor related functions This patch enables the use of wrapper routines to access the debug/breakpoint registers on cpu management. The hardcoded debug registers save and restore operations for threads breakpoints are replaced by wrappers. And now that we handle the kernel breakpoints too, we also need to handle them on cpu hotplug operations. [ Impact: adapt new hardware breakpoint api to cpu hotplug ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/smpboot.c | 3 +++ arch/x86/power/cpu_32.c | 13 +++---------- arch/x86/power/cpu_64.c | 12 +++--------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..2b2652d205c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -326,6 +327,7 @@ notrace static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); wmb(); + load_debug_registers(); cpu_idle(); } @@ -1250,6 +1252,7 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + hw_breakpoint_disable(); } int native_cpu_disable(void) diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 519913948003..2bc3b016de90 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -13,6 +13,7 @@ #include #include #include +#include static struct saved_context saved_context; @@ -48,6 +49,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4_safe(); + hw_breakpoint_disable(); } /* Needed by apm.c */ @@ -83,16 +85,7 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - 
set_debugreg(current->thread.debugreg7, 7); - } - + load_debug_registers(); } static void __restore_processor_state(struct saved_context *ctxt) diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 1e3bdcc959ff..46866a13a93a 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -16,6 +16,7 @@ #include #include #include +#include static void fix_processor_context(void); @@ -71,6 +72,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); + hw_breakpoint_disable(); } void save_processor_state(void) @@ -162,13 +164,5 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7){ - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } + load_debug_registers(); } -- cgit v1.2.3 From 66cb5917295958652ff6ba36d83f98f2379c46b4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:55 +0530 Subject: hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code This patch enables the use of abstract debug registers in process-handling routines, according to the new hardware breakpoint Api. [ Impact: adapt thread breakpoints handling code to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 22 ++++++---------------- arch/x86/kernel/process_32.c | 28 ++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 291527cb438a..19a686c401b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -46,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -106,12 +110,8 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg[0] = 0; - tsk->thread.debugreg[1] = 0; - tsk->thread.debugreg[2] = 0; - tsk->thread.debugreg[3] = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. 
@@ -193,16 +193,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5e4bfef4472..297ffff2ffc2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -265,7 +267,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -285,10 +293,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); +out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -427,6 +438,23 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. 
+ */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a1a1de292ec..f7b276d4b3fb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include asmlinkage extern void ret_from_fork(void); @@ -248,6 +250,8 @@ void release_thread(struct task_struct *dead_task) BUG(); } } + if (unlikely(dead_task->thread.debugreg7)) + flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -303,12 +307,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(me, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -347,6 +357,9 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); + return err; } @@ -492,6 +505,24 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); + return prev_p; } -- cgit v1.2.3 From da0cdc14f5f7e0faee6b2393fefed056cdb17146 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:03 +0530 Subject: hw-breakpoints: modify signal handling code to refrain from re-enabling HW Breakpoints This patch disables re-enabling of Hardware Breakpoint registers through the signal handling code. This is now done from hw_breakpoint_handler(). Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/signal.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..f33d2e0ef095 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -800,15 +800,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* * Re-enable any watchpoints before delivering the * signal to user space. 
The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* -- cgit v1.2.3 From 72f674d203cd230426437cdcf7dd6f681dad8b0d Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:48 +0530 Subject: hw-breakpoints: modify Ptrace routines to access breakpoint registers This patch modifies the ptrace code to use the new wrapper routines around the debug/breakpoint registers. [ Impact: adapt x86 ptrace to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Signed-off-by: Maneesh Soni Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 231 +++++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 90 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 313be40be55a..b457f78b7dbf 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -136,11 +137,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -265,15 +261,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -464,95 +451,159 @@ static int genregs_set(struct task_struct *target, } /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, + unsigned *type) { - switch (n) { - case 0: return child->thread.debugreg[0]; - case 1: return child->thread.debugreg[1]; - case 2: return child->thread.debugreg[2]; - case 3: return child->thread.debugreg[3]; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; - } - return 0; + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) { + struct thread_struct *thread = &(current->thread); int i; - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < hbp_kernel_pos; i++) + /* + * We will check bp->info.address against the address stored in + * thread's hbp structure and not debugreg[i]. 
This is to ensure + * that the corresponding bit for 'i' in DR7 register is enabled + */ + if (bp->info.address == thread->hbp[i]->info.address) + break; - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + thread->debugreg6 |= (DR_TRAP0 << i); +} - switch (n) { - case 0: child->thread.debugreg[0] = data; break; - case 1: child->thread.debugreg[1] = data; break; - case 2: child->thread.debugreg[2] = data; break; - case 3: child->thread.debugreg[3] = data; break; +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7 = thread->debugreg7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct hw_breakpoint *bp; + + data &= ~DR_CONTROL_RESERVED; +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->hbp[i]; + + if (!enabled) { + if (bp) { + /* Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + unregister_user_hw_breakpoint(tsk, bp); + kfree(bp); + } + continue; + } + if (!bp) { + rc = -ENOMEM; + bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); + if (bp) { + bp->info.address = thread->debugreg[i]; + bp->triggered = ptrace_triggered; + bp->info.len = len; + bp->info.type = type; + rc = register_user_hw_breakpoint(tsk, bp); + if (rc) + kfree(bp); + } + } else + rc = modify_user_hw_breakpoint(tsk, bp); + if (rc) + break; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) + val = thread->debugreg[n]; + else if (n == 6) + val = thread->debugreg6; + else if (n == 7) + val = thread->debugreg7; + return val; +} - case 7: - /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. 
even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) - */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + tsk->thread.debugreg6 = val; + goto ret_path; } + if (n < HBP_NUM) { + if (thread->hbp[n]) { + if (arch_check_va_in_userspace(val, + thread->hbp[n]->info.len) == 0) { + rc = -EIO; + goto ret_path; + } + thread->hbp[n]->info.address = val; + } + thread->debugreg[n] = val; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); - return 0; +ret_path: + return rc; } /* -- cgit v1.2.3 From 17f557e5b5d43a2af66c969f6560ac7105020672 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:03 +0530 Subject: hw-breakpoints: cleanup HW Breakpoint registers before kexec This patch disables Hardware breakpoints before doing a 'kexec' on the machine so that the CPU doesn't keep debug register values which would be out of sync for the new image. Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/machine_kexec_32.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include #include #include +#include static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include #include #include +#include static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.3 From 432039933a16b8227b7b267f46ac1c1b9b3adf14 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:20 +0530 Subject: hw-breakpoints: sample HW breakpoint over kernel data address This patch introduces a sample kernel module to demonstrate the use of the Hardware Breakpoint feature. It places a breakpoint over the kernel variable 'pid_max' to monitor all write operations and emits a function-backtrace when done. 
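For orientation, the registration flow boils down to the following (condensed from the module added below, using the same 2009-era calls; error handling and module boilerplate are trimmed, and the init function name here is illustrative):

static void sample_hbp_handler(struct hw_breakpoint *bp, struct pt_regs *regs)
{
	printk(KERN_INFO "%s value is changed\n", ksym_name);
	dump_stack();
}

static struct hw_breakpoint sample_hbp;

static int __init watch_pid_max(void)
{
	sample_hbp.info.name = "pid_max";
	sample_hbp.info.type = HW_BREAKPOINT_WRITE;	/* fire on writes only */
	sample_hbp.info.len = HW_BREAKPOINT_LEN_4;	/* watch 4 bytes */
	sample_hbp.triggered = sample_hbp_handler;
	return register_kernel_hw_breakpoint(&sample_hbp); /* < 0 if no debug register is free */
}

Loading the module with "insmod data_breakpoint.ko ksym=pid_max" and then writing to /proc/sys/kernel/pid_max triggers the handler and dumps a backtrace of the writer.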
Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- samples/Kconfig | 6 +++ samples/Makefile | 3 +- samples/hw_breakpoint/Makefile | 1 + samples/hw_breakpoint/data_breakpoint.c | 83 +++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 samples/hw_breakpoint/Makefile create mode 100644 samples/hw_breakpoint/data_breakpoint.c diff --git a/samples/Kconfig b/samples/Kconfig index b75d28cba3f7..8458516c693c 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -45,5 +45,11 @@ config SAMPLE_KRETPROBES default m depends on SAMPLE_KPROBES && KRETPROBES +config SAMPLE_HW_BREAKPOINT + tristate "Build kernel hardware breakpoint examples -- loadable module only" + depends on HAVE_HW_BREAKPOINT && m + help + This builds kernel hardware breakpoint example modules. + endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index 13e4b470b539..42e175598777 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,4 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ trace_events/ +obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ \ + trace_events/ hw_breakpoint/ diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile new file mode 100644 index 000000000000..0f5c31c2fc47 --- /dev/null +++ b/samples/hw_breakpoint/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c new file mode 100644 index 000000000000..9cbdbb871b7a --- /dev/null +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -0,0 +1,83 @@ +/* + * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * usage: insmod data_breakpoint.ko ksym=<ksym_name> + * + * This file is a kernel module that places a breakpoint over the ksym_name kernel + * variable using a Hardware Breakpoint register. The corresponding handler which + * prints a backtrace is invoked every time a write operation is performed on + * that variable. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ +#include <linux/module.h> /* Needed by all modules */ +#include <linux/kernel.h> /* Needed for KERN_INFO */ +#include <linux/init.h> /* Needed for the macros */ + +#include <asm/hw_breakpoint.h> + +struct hw_breakpoint sample_hbp; + +static char ksym_name[KSYM_NAME_LEN] = "pid_max"; +module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); +MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); + +void sample_hbp_handler(struct hw_breakpoint *temp, struct pt_regs *temp_regs) +{ + printk(KERN_INFO "%s value is changed\n", ksym_name); + dump_stack(); + printk(KERN_INFO "Dump stack from sample_hbp_handler\n"); +} + +static int __init hw_break_module_init(void) +{ + int ret; + +#ifdef CONFIG_X86 + sample_hbp.info.name = ksym_name; + sample_hbp.info.type = HW_BREAKPOINT_WRITE; + sample_hbp.info.len = HW_BREAKPOINT_LEN_4; +#endif /* CONFIG_X86 */ + + sample_hbp.triggered = (void *)sample_hbp_handler; + + ret = register_kernel_hw_breakpoint(&sample_hbp); + + if (ret < 0) { + printk(KERN_INFO "Breakpoint registration failed\n"); + return ret; + } else + printk(KERN_INFO "HW Breakpoint for %s write installed\n", + ksym_name); + + return 0; +} + +static void __exit hw_break_module_exit(void) +{ + unregister_kernel_hw_breakpoint(&sample_hbp); + printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); +} + +module_init(hw_break_module_init); +module_exit(hw_break_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("K.Prasad"); +MODULE_DESCRIPTION("ksym breakpoint"); -- cgit v1.2.3 From 0722db015c246204044299eae3b02d18d3ca4faf Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:40 +0530 Subject: hw-breakpoints: ftrace plugin for kernel symbol tracing using HW Breakpoint interfaces This patch adds an ftrace plugin to detect and profile memory access over kernel variables. It uses HW Breakpoint interfaces to 'watch' memory addresses. Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 21 ++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 23 ++ kernel/trace/trace_ksym.c | 525 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_selftest.c | 53 +++++ 5 files changed, 623 insertions(+) create mode 100644 kernel/trace/trace_ksym.c diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a508b9d2adb8..d7f01e6e8ba5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -314,6 +314,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. 
config STACK_TRACER bool "Trace max stack" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 06b85850fab4..658aace8c41e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -51,5 +51,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4771f8..7d5cc37b8fca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,6 +15,10 @@ #include #include +#ifdef CONFIG_KSYM_TRACER +#include +#endif + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -40,6 +44,7 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -207,6 +212,21 @@ struct syscall_trace_exit { unsigned long ret; }; +#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy" +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + +struct trace_ksym { + struct trace_entry ent; + struct hw_breakpoint *ksym_hbp; + unsigned long ksym_addr; + unsigned long ip; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; + char ksym_name[KSYM_NAME_LEN]; + char p_name[TASK_COMM_LEN]; +}; /* * trace_flag_type is an enumeration that holds different @@ -323,6 +343,7 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \ __ftrace_bad_type(); \ } while (0) @@ -540,6 +561,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 000000000000..11c74f6404cc --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,525 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include +#include +#include +#include +#include +#include + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +/* For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. 
+ */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ +#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1) + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +static DEFINE_MUTEX(ksym_tracer_mutex); + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->ksym_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) +{ + struct ring_buffer_event *event; + struct trace_array *tr; + struct trace_ksym *entry; + int pc; + + if (!ksym_tracing_enabled) + return; + + tr = ksym_trace_array; + pc = preempt_count(); + + event = trace_buffer_lock_reserve(tr, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN); + entry->ksym_hbp = hbp; + entry->ip = instruction_pointer(regs); + strlcpy(entry->p_name, current->comm, TASK_COMM_LEN); +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hbp->info.address); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(tr, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *access_str) +{ + int pos, access = 0; + + for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) { + switch (access_str[pos]) { + case 'r': + access += (pos == 0) ? 4 : -1; + break; + case 'w': + access += (pos == 1) ? 2 : -1; + break; + case '-': + break; + default: + return -EINVAL; + } + } + + switch (access) { + case 6: + access = HW_BREAKPOINT_RW; + break; + case 2: + access = HW_BREAKPOINT_WRITE; + break; + case 0: + access = 0; + } + + return access; +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * ::. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. :--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. 
Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + char *delimiter = ":"; + int ret; + + ret = -EINVAL; + *ksymname = strsep(&input_string, delimiter); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) || + (*addr == 0)) + goto return_code; + ret = ksym_trace_get_access_type(input_string); + +return_code: + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); + if (!entry->ksym_hbp) { + kfree(entry); + return -ENOMEM; + } + + entry->ksym_hbp->info.name = ksymname; + entry->ksym_hbp->info.type = op; + entry->ksym_addr = entry->ksym_hbp->info.address = addr; +#ifdef CONFIG_X86 + entry->ksym_hbp->info.len = HW_BREAKPOINT_LEN_4; +#endif + entry->ksym_hbp->triggered = (void *)ksym_hbp_handler; + + ret = register_kernel_hw_breakpoint(entry->ksym_hbp); + if (ret < 0) { + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + kfree(entry->ksym_hbp); + kfree(entry); + return -EAGAIN; + } + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX]; + ssize_t ret, cnt = 0; + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:", + entry->ksym_hbp->info.name); + if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE) + cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, + "-w-\n"); + else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW) + cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, + "rw-\n"); + } + ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf)); + mutex_unlock(&ksym_tracer_mutex); + + return ret; +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + /* Ignore echo "" > ksym_trace_filter */ + if (count == 0) + return 0; + + input_string = kzalloc(count, GFP_KERNEL); + if (!input_string) + return -ENOMEM; + + if (copy_from_user(input_string, buffer, count)) { + kfree(input_string); + return -EFAULT; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) { + kfree(input_string); + return ret; + } + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->ksym_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->ksym_hbp->info.type != op) + changed = 1; + else + goto err_ret; + break; + } + } + if (changed) { + unregister_kernel_hw_breakpoint(entry->ksym_hbp); + 
entry->ksym_hbp->info.type = op; + if (op > 0) { + ret = register_kernel_hw_breakpoint(entry->ksym_hbp); + if (ret == 0) { + ret = count; + goto unlock_ret_path; + } + } + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry->ksym_hbp); + kfree(entry); + ret = count; + goto err_ret; + } else { + /* Check for malformed request: (4) */ + if (op == 0) + goto err_ret; + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + if (ret) + goto err_ret; + } + ret = count; + goto unlock_ret_path; + +err_ret: + kfree(input_string); + +unlock_ret_path: + mutex_unlock(&ksym_tracer_mutex); + return ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + ksym_tracing_enabled = 0; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_kernel_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + /* Free the 'input_string' only if reset + * after startup self-test + */ +#ifdef CONFIG_FTRACE_SELFTEST + if (strncmp(entry->ksym_hbp->info.name, KSYM_SELFTEST_ENTRY, + strlen(KSYM_SELFTEST_ENTRY)) != 0) +#endif /* CONFIG_FTRACE_SELFTEST*/ + kfree(entry->ksym_hbp->info.name); + kfree(entry->ksym_hbp); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + + seq_puts(m, + "# TASK-PID CPU# Symbol Type " + "Function \n"); + seq_puts(m, + "# | | | | " + "| \n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct trace_ksym *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name, + entry->pid, iter->cpu, field->ksym_name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->ksym_hbp->info.type) { + case HW_BREAKPOINT_WRITE: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_RW: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%-20s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return 
register_tracer(&ksym_tracer); +} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Access type "); + seq_printf(m, " Symbol Counter \n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + if (entry->ksym_hbp) + access_type = entry->ksym_hbp->info.type; + + switch (access_type) { + case HW_BREAKPOINT_WRITE: + seq_printf(m, " W "); + break; + case HW_BREAKPOINT_RW: + seq_printf(m, " RW "); + break; + default: + seq_printf(m, " NA "); + } + + if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0) + seq_printf(m, " %s ", fn_name); + else + seq_printf(m, " "); + + seq_printf(m, "%15lu\n", entry->counter); + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return &(ksym_filter_head.first); +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd6485bdd7..71f2edb0fd84 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -807,3 +808,55 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. 
*/ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + -- cgit v1.2.3 From 62edab9056a6cf0c9207339c8892c923a5217e45 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:47:06 +0530 Subject: hw-breakpoints: reset bits in dr6 after the corresponding exception is handled This patch resets the bit in dr6 after the corresponding exception is handled in code, so that we keep a clean track of the current virtual debug status register. [ Impact: keep track of breakpoints triggering completion ] Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++-- arch/x86/kernel/kgdb.c | 6 ++++++ arch/x86/kernel/kprobes.c | 9 ++++++++- arch/x86/kernel/traps.c | 4 ++-- arch/x86/mm/kmmio.c | 8 +++++++- 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4867c9f3b5fb..69451473dbd2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -314,8 +314,12 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; - /* The DR6 value is stored in args->err */ - unsigned long dr7, dr6 = args->err; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) @@ -351,6 +355,11 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) if (bp) rc = NOTIFY_DONE; } + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching * or due to the delay between updates of hbp_kernel_pos diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..f820b73c7f28 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..b5b1848c5336 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -54,6 +54,7 @@ #include #include #include +#include void jprobe_return_end(void); @@ -967,8 +968,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de9913247dd0..124a4d5a95b2 100644 --- 
a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -545,8 +545,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..11a4ad4d6253 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) struct die_args *arg = args; if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) + if (post_kmmio_handler(arg->err, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; return NOTIFY_STOP; + } return NOTIFY_DONE; } -- cgit v1.2.3 From 73874005cd8800440be4299bd095387fff4b90ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Jun 2009 01:43:38 +0200 Subject: hw-breakpoints: fix undeclared ksym_tracer_mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ksym_tracer_mutex is declared inside an #ifdef CONFIG_PROFILE_KSYM_TRACER section. This makes it unavailable for the hardware breakpoint tracer if it is configured without the breakpoint profiler. This patch fixes the following build error: kernel/trace/trace_ksym.c: In function ‘ksym_trace_filter_read’: kernel/trace/trace_ksym.c:226: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function) kernel/trace/trace_ksym.c:226: erreur: (Each undeclared identifier is reported only once kernel/trace/trace_ksym.c:226: erreur: for each function it appears in.) kernel/trace/trace_ksym.c: In function ‘ksym_trace_filter_write’: kernel/trace/trace_ksym.c:273: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function) kernel/trace/trace_ksym.c: In function ‘ksym_trace_reset’: kernel/trace/trace_ksym.c:335: erreur: ‘ksym_tracer_mutex’ undeclared (first use in this function) make[1]: *** [kernel/trace/trace_ksym.o] Erreur 1 [ Impact: fix a build error ] Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_ksym.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 11c74f6404cc..eef97e7c8db7 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -44,12 +44,12 @@ static unsigned int ksym_tracing_enabled; static HLIST_HEAD(ksym_filter_head); +static DEFINE_MUTEX(ksym_tracer_mutex); + #ifdef CONFIG_PROFILE_KSYM_TRACER #define MAX_UL_INT 0xffffffff -static DEFINE_MUTEX(ksym_tracer_mutex); - void ksym_collect_stats(unsigned long hbp_hit_addr) { struct hlist_node *node; -- cgit v1.2.3 From 9d9949ce45b42be7a8157ea00fff2b1005ed844a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Jun 2009 00:55:45 -0400 Subject: tracing/function-profiler: do not free per cpu variable stat The per cpu variable stat is freed if we fail to allocate a name on start up. This was due to stat at first being allocated in the initial design. But since then, it has become a static per cpu variable but the free on error was not removed. Also added __init annotation to the function that this is in. 
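In sketch form, the bug class being removed looks like this (a hedged illustration with approximate names, close to but not verbatim from ftrace.c; the point is only that kfree() must pair with an actual allocation):

/* 'stat' points into a static per-CPU area, not at kmalloc()ed memory */
static DEFINE_PER_CPU(struct ftrace_profile_stat, profile_stats);

static void __init sketch(int cpu)
{
	struct ftrace_profile_stat *stat = &per_cpu(profile_stats, cpu);
	char *name = kasprintf(GFP_KERNEL, "function%d", cpu);

	if (!name)
		kfree(stat);	/* the removed line: frees memory that was never
				 * allocated, corrupting the allocator's state */
}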
[ Impact: prevent possible memory corruption on low mem at boot up ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d6973dfadb36..134e580ff251 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -766,7 +766,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -784,7 +784,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) * The files created are permanent, if something happens * we still do not free memory. */ - kfree(stat); WARN(1, "Could not allocate stat file for cpu %d\n", cpu); @@ -811,7 +810,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) } #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ -- cgit v1.2.3 From a96110b7ab3cebecf11dac208361feadb66eabfd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Jun 2009 00:59:48 -0400 Subject: tracing/branch-profiler: fix return value check on register_stat_tracer The branch profiler incorrectly takes a zero return on register_stat_tracer as an error. But zero means success. [ Impact: remove annoying false warning on boot up ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd249a9..19c618ff0b38 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -379,7 +379,7 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); return 1; -- cgit v1.2.3 From d2de4151121c209ac77c07495db872b6d36e7c30 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 13:04:18 -0400 Subject: tracing/branch-profiler: do not profile modules Currently the branch profiler does not handle modules. Do not bother processing the code for them. Signed-off-by: Steven Rostedt --- include/linux/compiler.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 37bcb50a4d7c..dd422945c83f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -75,9 +75,11 @@ struct ftrace_branch_data { /* * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. + * We currently do not profile modules. */ -#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ - && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) +#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ + && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) \ + && !defined(MODULE) void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define likely_notrace(x) __builtin_expect(!!(x), 1) -- cgit v1.2.3 From 4415a5d50ef9dcae5c681c43d5ca18e77e29677b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 13:50:25 -0400 Subject: tracing/branch-profiler: move macro defines to kernel.h The branch profiler should not profile boot up code in the archs. 
Moving it from compiler.h to kernel.h brings down its use. This movement means we can also reference functions and variables from the macros. Signed-off-by: Steven Rostedt --- include/linux/compiler.h | 87 ++---------------------------------------------- include/linux/kernel.h | 81 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index dd422945c83f..24b35c6d2f03 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -55,91 +55,8 @@ extern void __chk_io_ptr(const volatile void __iomem *); * specific implementations come from the above header files */ -struct ftrace_branch_data { - const char *func; - const char *file; - unsigned line; - union { - struct { - unsigned long correct; - unsigned long incorrect; - }; - struct { - unsigned long miss; - unsigned long hit; - }; - unsigned long miss_hit[2]; - }; -}; - -/* - * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code - * to disable branch tracing on a per file basis. - * We currently do not profile modules. - */ -#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ - && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) \ - && !defined(MODULE) -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); - -#define likely_notrace(x) __builtin_expect(!!(x), 1) -#define unlikely_notrace(x) __builtin_expect(!!(x), 0) - -#define __branch_check__(x, expect) ({ \ - int ______r; \ - static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_annotated_branch"))) \ - ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ - }; \ - ______r = likely_notrace(x); \ - ftrace_likely_update(&______f, ______r, expect); \ - ______r; \ - }) - -/* - * Using __builtin_constant_p(x) to ignore cases where the return - * value is always the same. This idea is taken from a similar patch - * written by Daniel Walker. - */ -# ifndef likely -# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) -# endif -# ifndef unlikely -# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) -# endif - -#ifdef CONFIG_PROFILE_ALL_BRANCHES -/* - * "Define 'is'", Bill Clinton - * "Define 'if'", Steven Rostedt - */ -#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) ) -#define __trace_if(cond) \ - if (__builtin_constant_p((cond)) ? 
!!(cond) : \ - ({ \ - int ______r; \ - static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((section("_ftrace_branch"))) \ - ______f = { \ - .func = __func__, \ - .file = __FILE__, \ - .line = __LINE__, \ - }; \ - ______r = !!(cond); \ - ______f.miss_hit[______r]++; \ - ______r; \ - })) -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ - -#else -# define likely(x) __builtin_expect(!!(x), 1) -# define unlikely(x) __builtin_expect(!!(x), 0) -#endif +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) /* Optimization barrier */ #ifndef barrier diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 883cd44ff765..f96a48188aeb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -20,6 +20,87 @@ #include #include + +struct ftrace_branch_data { + const char *func; + const char *file; + unsigned line; + union { + struct { + unsigned long correct; + unsigned long incorrect; + }; + struct { + unsigned long miss; + unsigned long hit; + }; + unsigned long miss_hit[2]; + }; +}; + +/* + * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code + * to disable branch tracing on a per file basis. + * We currently do not profile modules. + */ +#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ + && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) \ + && !defined(MODULE) +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); + +#define likely_notrace(x) __builtin_expect(!!(x), 1) +#define unlikely_notrace(x) __builtin_expect(!!(x), 0) + +#define __branch_check__(x, expect) ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_annotated_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = likely_notrace(x); \ + ftrace_likely_update(&______f, ______r, expect); \ + ______r; \ + }) + +/* + * Using __builtin_constant_p(x) to ignore cases where the return + * value is always the same. This idea is taken from a similar patch + * written by Daniel Walker. + */ +#undef likely +#define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1)) +#undef unlikely +#define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) + +#ifdef CONFIG_PROFILE_ALL_BRANCHES +/* + * "Define 'is'", Bill Clinton + * "Define 'if'", Steven Rostedt + */ +#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) ) +#define __trace_if(cond) \ + if (__builtin_constant_p((cond)) ? !!(cond) : \ + ({ \ + int ______r; \ + static struct ftrace_branch_data \ + __attribute__((__aligned__(4))) \ + __attribute__((section("_ftrace_branch"))) \ + ______f = { \ + .func = __func__, \ + .file = __FILE__, \ + .line = __LINE__, \ + }; \ + ______r = !!(cond); \ + ______f.miss_hit[______r]++; \ + ______r; \ + })) +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ +#endif + extern const char linux_banner[]; extern const char linux_proc_banner[]; -- cgit v1.2.3 From 7148bdf8c661b685430ec7934b91dcd9856d258e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 15:35:52 -0400 Subject: tracing/branch-profiling: add variable to disable branch profiling Currently, the branch profiler is either always enabled, or always disabled. This patch adds a sysctl (/proc/sys/kernel/branch_profiling_enabled) to be able to enable/disable the branch profiler. This should help on cache line bouncing when not enabled. 
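The payoff can be seen in a standalone analogue (plain C with GCC statement expressions, mirroring the kernel macro; all names here are mine, not the kernel's): with the flag off, each profiled branch costs one load and test instead of a read-modify-write on shared counters.

#include <stdio.h>

static int profiling_enabled;		/* stand-in for the sysctl */
static unsigned long miss_hit[2];	/* stand-in for one branch table entry */

#define PROFILED_IF(cond) \
	if (!profiling_enabled ? !!(cond) : \
	    ({ int _r = !!(cond); miss_hit[_r]++; _r; }))

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)	/* flag off: nothing is counted */
		PROFILED_IF(i & 1)
			;
	profiling_enabled = 1;
	for (i = 0; i < 10; i++)	/* flag on: counts 5 misses, 5 hits */
		PROFILED_IF(i & 1)
			;
	printf("miss=%lu hit=%lu\n", miss_hit[0], miss_hit[1]);
	return 0;
}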
Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 4 +++- kernel/sysctl.c | 10 ++++++++++ kernel/trace/trace_branch.c | 9 +++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f96a48188aeb..9372a8765e4b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -77,13 +77,15 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0)) #ifdef CONFIG_PROFILE_ALL_BRANCHES +extern int sysctl_branch_profiling_enabled; /* * "Define 'is'", Bill Clinton * "Define 'if'", Steven Rostedt */ #define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) ) #define __trace_if(cond) \ - if (__builtin_constant_p((cond)) ? !!(cond) : \ + if ((!sysctl_branch_profiling_enabled || \ + __builtin_constant_p((cond))) ? !!(cond) : \ ({ \ int ______r; \ static struct ftrace_branch_data \ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2970d56fb76..2628aa75b1c7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -912,6 +912,16 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PROFILE_ALL_BRANCHES + { + .ctl_name = CTL_UNNUMBERED, + .procname = "branch_profiling_enabled", + .data = &sysctl_branch_profiling_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 19c618ff0b38..e4f146579850 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -334,6 +334,15 @@ fs_initcall(init_annotated_branch_stats); #ifdef CONFIG_PROFILE_ALL_BRANCHES +int sysctl_branch_profiling_enabled __read_mostly; + +static int __init set_enable_branch_profiler(char *str) +{ + sysctl_branch_profiling_enabled = 1; + return 1; +} +__setup("enable_branch_profiler", set_enable_branch_profiler); + extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; -- cgit v1.2.3 From e482f8395f215e0ad6557b2722cd9b9b308035c4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 4 Jun 2009 00:08:06 -0400 Subject: tracing/branch-profiler: add option to profile branches per cpu The branch profiler increments a variable at every branch, saying whether the branch was taken or not. The problem with this is that on large CPU machines, this can cause severe cacheline bouncing. A way around this is to have per cpu tables. But having per cpu buffers can also be an issue. The branch profiler requires a table of all branches, which can be rather large (looking at my current image, it is 2.2 megs). For machines that have only 2 to 4 CPUs, the cache line bouncing may not be that much of an issue. This will needlessly double the amount of memory needed. This patch creates an option that allows the user to enable or disable percpu table recording of the branch profiler. 
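The shape of the trade-off, as a small userspace sketch (fixed array slots standing in for real per-CPU areas; names are mine): the hot path touches only the current CPU's own cache line, and only the report path pays the cost of summing, much like calculate_stat() in the patch below.

#include <stdio.h>

#define NR_CPUS 4

struct branch_slot {
	unsigned long miss_hit[2];
	char pad[64 - 2 * sizeof(unsigned long)]; /* keep slots on separate cache lines */
};

static struct branch_slot slots[NR_CPUS];

static void profile_branch(int cpu, int taken)	/* hot path: private slot, no bouncing */
{
	slots[cpu].miss_hit[!!taken]++;
}

static unsigned long total(int taken)		/* report path: sum across all slots */
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += slots[cpu].miss_hit[!!taken];
	return sum;
}

int main(void)
{
	profile_branch(0, 1);
	profile_branch(1, 1);
	profile_branch(2, 0);
	printf("miss=%lu hit=%lu\n", total(0), total(1));	/* miss=1 hit=2 */
	return 0;
}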
Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 16 ++++++++-- include/linux/kernel.h | 11 ++++++- kernel/trace/Kconfig | 13 ++++++++ kernel/trace/trace_branch.c | 62 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 96 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f1736ca7922c..ddc78c3bd032 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -56,11 +56,19 @@ #endif #ifdef CONFIG_PROFILE_ALL_BRANCHES -#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \ +#define _BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \ *(_ftrace_branch) \ VMLINUX_SYMBOL(__stop_branch_profile) = .; #else -#define BRANCH_PROFILE() +#define _BRANCH_PROFILE() +#endif + +#ifdef CONFIG_PROFILE_BRANCHES_PER_CPU +# define BRANCH_PROFILE() +# define BRANCH_PER_CPU_PROFILE() _BRANCH_PROFILE() +#else +# define BRANCH_PROFILE() _BRANCH_PROFILE() +# define BRANCH_PER_CPU_PROFILE() #endif #ifdef CONFIG_EVENT_TRACING @@ -111,7 +119,7 @@ *(__verbose) \ VMLINUX_SYMBOL(__stop___verbose) = .; \ LIKELY_PROFILE() \ - BRANCH_PROFILE() \ + BRANCH_PROFILE() \ TRACE_PRINTKS() \ FTRACE_EVENTS() \ TRACE_SYSCALLS() @@ -481,6 +489,7 @@ *(.data.percpu.page_aligned) \ *(.data.percpu) \ *(.data.percpu.shared_aligned) \ + BRANCH_PER_CPU_PROFILE() \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ } phdr \ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu); @@ -507,5 +516,6 @@ *(.data.percpu.page_aligned) \ *(.data.percpu) \ *(.data.percpu.shared_aligned) \ + BRANCH_PER_CPU_PROFILE() \ VMLINUX_SYMBOL(__per_cpu_end) = .; \ } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9372a8765e4b..ef821ffaec57 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -78,6 +78,15 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #ifdef CONFIG_PROFILE_ALL_BRANCHES extern int sysctl_branch_profiling_enabled; +#ifdef CONFIG_PROFILE_BRANCHES_PER_CPU +extern void branch_profiler(struct ftrace_branch_data *data, int cond); +#else +static inline void branch_profiler(struct ftrace_branch_data *data, int cond) +{ + data->miss_hit[cond]++; +} +#endif + /* * "Define 'is'", Bill Clinton * "Define 'if'", Steven Rostedt @@ -97,7 +106,7 @@ extern int sysctl_branch_profiling_enabled; .line = __LINE__, \ }; \ ______r = !!(cond); \ - ______f.miss_hit[______r]++; \ + branch_profiler(&______f, ______r); \ ______r; \ })) #endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4a13e5a01ce3..309a264bf760 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -283,6 +283,19 @@ config PROFILE_ALL_BRANCHES is to be analyzed endchoice +config PROFILE_BRANCHES_PER_CPU + bool "Profile branches on a per cpu basis" + depends on PROFILE_ALL_BRANCHES + help + When profiling all branches, the system can take a big cache line + bouncing hit. On boxes with lots of CPUs, this can slow the system + down so much that it can live lock. + + This option solves the issue by making the profiler data per CPU. + This avoids the cache line bouncing, but at the cost of having the + branch table copied for every CPU. This table is quite large and + this option will duplicate it for every CPU. 
+ config TRACING_BRANCHES bool help diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e4f146579850..08e3d90d5c07 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -343,6 +344,38 @@ static int __init set_enable_branch_profiler(char *str) } __setup("enable_branch_profiler", set_enable_branch_profiler); +static unsigned long branch_count; + +#ifdef CONFIG_PROFILE_BRANCHES_PER_CPU +#define branch_percpu(p, cpu) SHIFT_PERCPU_PTR(p, per_cpu_offset(cpu)) +void branch_profiler(struct ftrace_branch_data *data, int cond) +{ + branch_percpu(data, raw_smp_processor_id())->miss_hit[cond]++; +} +static void calculate_stat(struct ftrace_branch_data *stat, + struct ftrace_branch_data *p) +{ + int rec = 0; + int cpu; + + for_each_online_cpu(cpu) { + if (!rec) { + rec++; + *stat = *branch_percpu(p, cpu); + } else { + stat->miss_hit[0] += branch_percpu(p, cpu)->miss_hit[0]; + stat->miss_hit[1] += branch_percpu(p, cpu)->miss_hit[1]; + } + } +} +#else +static void calculate_stat(struct ftrace_branch_data *stat, + struct ftrace_branch_data *p) +{ + *stat = *p; +} +#endif + extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; @@ -357,6 +390,11 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } +struct ftrace_branch_stat { + struct tracer_stat stat; + int cpu; +}; + static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; @@ -369,20 +407,36 @@ all_branch_stat_next(void *v, int idx) ++p; - if ((void *)p >= (void *)__stop_branch_profile) + if (idx >= branch_count) return NULL; return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + struct ftrace_branch_data stat; + + calculate_stat(&stat, p); + + return branch_stat_show(m, &stat); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; +static void calculate_branch_count(void) +{ + branch_count = ((unsigned long)&__stop_branch_profile - + (unsigned long)&__start_branch_profile) / + sizeof(struct ftrace_branch_stat); +} __init static int all_annotated_branch_stats(void) { int ret; @@ -393,7 +447,11 @@ __init static int all_annotated_branch_stats(void) "all branches stats\n"); return 1; } + + calculate_branch_count(); + return 0; } + fs_initcall(all_annotated_branch_stats); #endif /* CONFIG_PROFILE_ALL_BRANCHES */ -- cgit v1.2.3 From cb0473e6be6f68fbf7ddc95c9c3ff7610b8598ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Jun 2009 16:24:33 +0200 Subject: tracing/branch-tracer: PROFILE_BRANCHES_PER_CPU must depend on SMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While enabling the per cpu branch tracing in a non SMP config, we get the following build failure: kernel/trace/trace_branch.c: In function ‘branch_profiler’: kernel/trace/trace_branch.c:353: error: implicit declaration of function ‘SHIFT_PERCPU_PTR’ kernel/trace/trace_branch.c:353: error: implicit declaration of function ‘per_cpu_offset’ kernel/trace/trace_branch.c:353: error: invalid type argument of ‘->’ (have ‘int’) kernel/trace/trace_branch.c: In function ‘calculate_stat’: kernel/trace/trace_branch.c:364: error: invalid type argument of ‘unary *’ (have ‘int’) 
kernel/trace/trace_branch.c:366: error: invalid type argument of ‘->’ (have ‘int’) kernel/trace/trace_branch.c:367: error: invalid type argument of ‘->’ (have ‘int’) This is because the branch tracer uses per cpu internals that are not supported in the !CONFIG_SMP case. To fix it we just need to disable the per cpu branch tracing in the !CONFIG_SMP case. [ Impact: fix a build failure with the branch tracer ] Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 309a264bf760..0efbcc560bf1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -285,7 +285,7 @@ endchoice config PROFILE_BRANCHES_PER_CPU bool "Profile branches on a per cpu basis" - depends on PROFILE_ALL_BRANCHES + depends on PROFILE_ALL_BRANCHES && SMP help When profiling all branches, the system can take a big cache line bouncing hit. On boxes with lots of CPUs, this can slow the system -- cgit v1.2.3 From 849620fab413355eff48232eac5a8c53c57615c5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 14 May 2009 17:10:52 +0200 Subject: Revert "oprofile: discover counters for op ppro too" This reverts commit 59512900baab03c5629f2ff5efad1d5d4e682ece. arch_perfmon_setup_counters() is actually never called for ppro, so there is no code that changes the numbers in op_ppro_spec. The patch as it is has no effect. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 8 +++----- arch/x86/oprofile/op_x86_model.h | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 10131fbdaada..2a123990a84c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -213,9 +213,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, /* can be overriden */ - .num_controls = 2, /* dito */ +struct op_x86_model_spec const op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -251,8 +251,6 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; - op_ppro_spec.num_counters = num_counters; - op_ppro_spec.num_controls = num_counters; } struct op_x86_model_spec op_arch_perfmon_spec = { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d64..2317149c94f0 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -45,7 +45,7 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; -- cgit v1.2.3 From e419294ed3c98cccc145202e4fe165bfd8099d63 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sun, 12 Oct 2008 15:12:34 -0400 Subject: x86/oprofile: moving arch_perfmon counter setup to op_x86_model_spec.init The function arch_perfmon_init() in nmi_int.c is model specific. This patch moves it to op_model_ppro.c by using the init function pointer in struct op_x86_model_spec.
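The mechanism being leaned on is the model vtable: common code dispatches through the callbacks in struct op_x86_model_spec, so model-specific setup can live next to the model. A hedged sketch of the caller side (simplified; init_model() is an illustrative name, the real op_nmi_init() carries more error handling, and the types are assumed from op_x86_model.h):

static int init_model(struct op_x86_model_spec const *model,
		      struct oprofile_operations *ops)
{
	if (model->init) {
		int ret = model->init(ops);	/* e.g. arch_perfmon_init() */
		if (ret)
			return ret;		/* model refused to initialize */
	}
	/* ... generic counter setup continues here ... */
	return 0;
}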
Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 21 +++++++++------------ arch/x86/oprofile/op_model_ppro.c | 9 ++++++++- arch/x86/oprofile/op_x86_model.h | 2 -- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3b285e656e27..dd8515301fbf 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -427,7 +427,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - arch_perfmon_setup_counters(); + model = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -442,16 +442,6 @@ static int __init ppro_init(char **cpu_type) return 1; } -static int __init arch_perfmon_init(char **cpu_type) -{ - if (!cpu_has_arch_perfmon) - return 0; - *cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - arch_perfmon_setup_counters(); - return 1; -} - /* in order to get sysfs right */ static int using_nmi; @@ -509,8 +499,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) break; } - if (!cpu_type && !arch_perfmon_init(&cpu_type)) + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; break; default: diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 2a123990a84c..ae5811966883 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -233,7 +233,7 @@ struct op_x86_model_spec const op_ppro_spec = { * the specific CPU. */ -void arch_perfmon_setup_counters(void) +static void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; @@ -253,7 +253,14 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_controls = num_counters; } +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; +} + struct op_x86_model_spec op_arch_perfmon_spec = { + .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, /* user space does the cpuid check for available events */ diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 2317149c94f0..ed27783bb0d4 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -51,6 +51,4 @@ extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; -extern void arch_perfmon_setup_counters(void); - #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3 From 06552ccc36abeb12e37efc16c384dc7f30794f85 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 May 2009 02:12:36 +0200 Subject: x86/oprofile: minor style changes in struct op_x86_model_spec Some vertical alignments. Variables are now located in the beginning of the struct. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_x86_model.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index ed27783bb0d4..bd8157d12ff0 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -32,17 +32,17 @@ struct pt_regs; * various x86 CPU models' perfctr support. 
*/ struct op_x86_model_spec { - int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - unsigned int num_counters; - unsigned int num_controls; - void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); + unsigned int num_counters; + unsigned int num_controls; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); }; extern struct op_x86_model_spec const op_ppro_spec; -- cgit v1.2.3 From 0dc8335aa3e59f1adbb0fce7c9c0b342146cd036 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 7 May 2009 15:09:33 +0200 Subject: oprofile: remove irq_flags in struct op_entry This became obsolete with this commit: 304cc6a ring_buffer: remove unused flags parameter, fix Signed-off-by: Robert Richter --- include/linux/oprofile.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 1d9518bc4c58..dbbe2dbc4418 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -171,7 +171,6 @@ struct op_sample; struct op_entry { struct ring_buffer_event *event; struct op_sample *sample; - unsigned long irq_flags; unsigned long size; unsigned long *data; }; -- cgit v1.2.3 From fecfe6320ba71eed6d16849683298f0894aa60de Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 7 May 2009 16:07:41 +0200 Subject: oprofile: remove obsolete include headers This became obsolete with this commit: 6dad828 oprofile: port to the new ring_buffer Signed-off-by: Robert Richter --- drivers/oprofile/cpu_buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 242257b19441..50640cc5eef2 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -21,7 +21,6 @@ #include #include -#include #include #include "event_buffer.h" -- cgit v1.2.3 From 9063759540daac40cc1f402f83a3be6b489f8583 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 10 Mar 2009 19:15:57 +0100 Subject: x86/oprofile: remove #ifdefs in ibs functions The IBS code is moved to separate functions. This allows the removal of #ifdefs from the function bodies.
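The idiom used here is the standard kernel way of keeping #ifdefs out of function bodies: compile the real helpers when the option is set and provide empty static inline stubs otherwise, so every call site stays unconditional and the optimizer discards the no-op calls. A simplified sketch of the shape (the real helpers follow in the diff):

#ifdef CONFIG_OPROFILE_IBS
static void op_amd_start_ibs(void)
{
	/* program the IBS fetch/op control MSRs */
}
#else
/* stub: callers compile unchanged, the call is optimized away */
static inline void op_amd_start_ibs(void) { }
#endif

static void op_amd_start(struct op_msrs const * const msrs)
{
	/* ... enable the ordinary counters ... */
	op_amd_start_ibs();	/* no #ifdef needed at the call site */
}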
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 80 +++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf9..b54c0880b7dc 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -220,6 +220,50 @@ op_amd_handle_ibs(struct pt_regs * const regs, return 1; } +static inline void op_amd_start_ibs(void) +{ + unsigned int low, high; + if (has_ibs && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + + IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (has_ibs && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +} + +static void op_amd_stop_ibs(void) +{ + unsigned int low, high; + if (has_ibs && ibs_config.fetch_enabled) { + /* clear max count and enable */ + low = 0; + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (has_ibs && ibs_config.op_enabled) { + /* clear max count and enable */ + low = 0; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +} + +#else + +static inline int op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { } +static inline void op_amd_start_ibs(void) { } +static inline void op_amd_stop_ibs(void) { } + #endif static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -238,9 +282,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } -#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); -#endif /* See op_model_ppro.c */ return 1; @@ -258,25 +300,9 @@ static void op_amd_start(struct op_msrs const * const msrs) } } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_start_ibs(); } - static void op_amd_stop(struct op_msrs const * const msrs) { unsigned int low, high; @@ -294,21 +320,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (has_ibs && ibs_config.op_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_stop_ibs(); } static void op_amd_shutdown(struct op_msrs const * const msrs) -- cgit v1.2.3 From d20f24c66011f8a397bca6c5d1a6a7c7e612d2d7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sun, 11 Jan 2009 13:01:16 +0100 Subject: x86/oprofile: simplify AMD cpu init code Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index dd8515301fbf..ae0ab03959b4 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -460,27 +460,26 @@ int 
__init op_nmi_init(struct oprofile_operations *ops) /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { - default: - return -ENODEV; case 6: - model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_amd_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; + default: + return -ENODEV; } + model = &op_amd_spec; break; case X86_VENDOR_INTEL: -- cgit v1.2.3 From ff9faa8b676e195476b86f03fe58db0f01bda8f3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 15:36:29 +0200 Subject: x86/oprofile: move common macros to op_x86_model.h There are duplicate macro implementations in model specific code. This patch moves all common macros to op_x86_model.h. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 8 -------- arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 8 -------- arch/x86/oprofile/op_x86_model.h | 9 +++++++++ 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b54c0880b7dc..4b9254a67e68 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,22 +26,14 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) #define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) #define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) #define CTRL_CLEAR_LO(x) (x &= (1<<21)) #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd752..420c15e71237 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -366,8 +366,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 
1 : 0) #define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) #define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index ae5811966883..a922a1a815c3 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -26,19 +26,11 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) #define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) #define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) static u64 *reset_value; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index bd8157d12ff0..c80ec7d09993 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -11,6 +11,15 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) + struct op_saved_msr { unsigned int high; unsigned int low; -- cgit v1.2.3 From d2731a4387ad6c6bca07abfe9ed41d450fb6d665 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 19:47:38 +0200 Subject: x86/oprofile: remove MSR macros for AMD cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. 
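One idiom in these diffs deserves a gloss: the performance counters count upward and raise an interrupt on overflow, so a sampling period of N is armed by writing the two's complement -N, and with the split 32-bit rdmsr()/wrmsr() interface the high word is written as -1 to sign-extend. A sketch (arm_counter() is an illustrative helper, not kernel code):

static void arm_counter(unsigned int msr, unsigned long count)
{
	/* low word = -count, high word = all ones (sign extension) */
	wrmsr(msr, -(unsigned int)count, -1);
}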
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 4b9254a67e68..c6181c265ae0 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,12 +26,7 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) #define CTRL_CLEAR_LO(x) (x &= (1<<21)) #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) @@ -101,17 +96,17 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; - CTR_WRITE(1, msrs, i); + wrmsr(msrs->counters[i].addr, -1, -1); } /* enable active counters */ @@ -119,9 +114,9 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - CTR_WRITE(counter_config[i].count, msrs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); @@ -133,7 +128,7 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; } @@ -267,10 +262,10 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; - CTR_READ(low, high, msrs, i); + rdmsr(msrs->counters[i].addr, low, high); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); } } @@ -286,9 +281,9 @@ static void op_amd_start(struct op_msrs const * const msrs) int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } @@ -307,9 +302,9 @@ static void op_amd_stop(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } op_amd_stop_ibs(); -- cgit v1.2.3 From 74c9a5c341bb1f6cbb5095b07c77230f19682ce8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 19:47:38 +0200 Subject: x86/oprofile: 
remove MSR macros for ppro cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index a922a1a815c3..6c5d288c566e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -27,9 +27,6 @@ static int num_counters = 2; static int counter_width = 32; #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - -#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) #define CTRL_CLEAR(x) (x &= (1<<21)) #define CTRL_SET_EVENT(val, e) (val |= e) @@ -88,9 +85,9 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -107,14 +104,14 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) wrmsrl(msrs->counters[i].addr, -reset_value[i]); - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR(low); CTRL_SET_ENABLE(low); CTRL_SET_USR(low, counter_config[i].user); CTRL_SET_KERN(low, counter_config[i].kernel); CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT(low, counter_config[i].event); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; } @@ -162,9 +159,9 @@ static void ppro_start(struct op_msrs const * const msrs) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } } @@ -180,9 +177,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } -- cgit v1.2.3 From 1131a478245b00664ae2dbc0f68db987b51fa806 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 20:23:23 +0200 Subject: x86/oprofile: remove MSR macros for p4 cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. 
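The counterpart of arming a counter with -N (see the sketch above) is the overflow test these models use: an armed counter sits in the negative range with its top bit set, and that bit going clear means the counter wrapped past zero. Roughly, for a 32-bit counter (counter_overflowed() is illustrative):

static int counter_overflowed(unsigned int msr)
{
	unsigned int low, high;	/* high word unused for 32-bit counters */

	rdmsr(msr, low, high);
	/* armed value is negative (bit 31 set); clear bit => wrapped */
	return !(low & (1U << 31));
}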
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_p4.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 420c15e71237..365d8a9c03d3 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -350,8 +350,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) -#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -361,13 +359,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) -#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) @@ -513,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ - ESCR_READ(escr, high, ev, i); + rdmsr(ev->bindings[i].escr_address, escr, high); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); @@ -524,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - ESCR_WRITE(escr, high, ev, i); + wrmsr(ev->bindings[i].escr_address, escr, high); /* modify CCCR */ - CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); @@ -535,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) CCCR_SET_PMI_OVF_0(cccr); else CCCR_SET_PMI_OVF_1(cccr); - CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); return; } } @@ -582,7 +578,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u32)counter_config[i].count, -1); } else { reset_value[i] = 0; } @@ -622,14 +619,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, real = VIRT_CTR(stag, i); - CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { oprofile_add_sample(regs, i); - 
CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].counter_address, + -(u32)reset_value[i], -1); CCCR_CLEAR_OVF(low); - CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsr(p4_counters[real].counter_address, + -(u32)reset_value[i], -1); } } @@ -651,9 +650,9 @@ static void p4_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_ENABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -668,9 +667,9 @@ static void p4_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_DISABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } -- cgit v1.2.3 From ec064c093e254f4433afb17dcef7f964c76436af Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 15:05:50 +0200 Subject: x86/oprofile: fix and cleanup CTRL_SET_* macros This patch fixes missing braces around macro parameters. Macro definitions from intel_arch_perfmon.h are used where possible. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 1 - arch/x86/oprofile/op_x86_model.h | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 6c5d288c566e..61ee8f640535 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index c80ec7d09993..a207b1c46e26 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -11,14 +11,16 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#include + +#define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) +#define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) +#define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) +#define CTRL_SET_ENABLE(val) ((val) |= ARCH_PERFMON_EVENTSEL_INT) +#define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) +#define CTRL_SET_KERN(val, k) ((val) |= ((k) ? ARCH_PERFMON_EVENTSEL_OS : 0)) +#define CTRL_SET_USR(val, u) ((val) |= ((u) ? ARCH_PERFMON_EVENTSEL_USR : 0)) +#define CTRL_SET_UM(val, m) ((val) |= ((m) << 8)) struct op_saved_msr { unsigned int high; -- cgit v1.2.3 From 9c59354b48ce9cf28048b02fea73dd0236f876ea Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 18:16:43 +0200 Subject: x86/oprofile: remove unused macros for AMD virtualization profiling The use of the macros has no effect. The oprofilefs has to be extended first to support these features. 
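"No effect" because the only call sites pass a literal 0, so the macros OR nothing into the control value; there is no oprofilefs attribute yet through which user space could request the host-only or guest-only bits. Spelled out (macro bodies as in the removed code, demo() is illustrative):

#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))

static void demo(void)
{
	unsigned int high = 0;

	CTRL_SET_HOST_ONLY(high, 0);	/* ORs in 0 << 9: high unchanged */
	CTRL_SET_GUEST_ONLY(high, 0);	/* ORs in 0 << 8: high unchanged */
}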
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c6181c265ae0..aaa7ffaed6b9 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -31,8 +31,6 @@ #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; @@ -125,9 +123,6 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT_LOW(low, counter_config[i].event); CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; } -- cgit v1.2.3 From ef8828ddf828174785421af67c281144d4b8e796 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 19:31:44 +0200 Subject: x86/oprofile: pass the model to setup_ctrs() functions In follow-on patches the setup_ctrs() functions will need data that describes the model. This patch extends the function argument list to pass a pointer to the model to these functions. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 2 +- arch/x86/oprofile/op_model_amd.c | 3 ++- arch/x86/oprofile/op_model_p4.c | 3 ++- arch/x86/oprofile/op_model_ppro.c | 3 ++- arch/x86/oprofile/op_x86_model.h | 3 ++- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index ae0ab03959b4..c31f87bbf436 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -125,7 +125,7 @@ static void nmi_cpu_setup(void *dummy) int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); spin_lock(&oprofilefs_lock); - model->setup_ctrs(msrs); + model->setup_ctrs(model, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index aaa7ffaed6b9..86e0a01ba125 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -85,7 +85,8 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) } -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int low, high; int i; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 365d8a9c03d3..05ba0287b1f7 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -542,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } -static void p4_setup_ctrs(struct op_msrs const * const msrs) +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 61ee8f640535..40b44ee521d5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -51,7 +51,8 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) } -static
void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int low, high; int i; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index a207b1c46e26..6161c7f0e7fb 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -48,7 +48,8 @@ struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); -- cgit v1.2.3 From 3370d358569755625aba4d9a846a040ce691d9ed Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 15:10:32 +0200 Subject: x86/oprofile: replace macros to calculate control register This patch introduces op_x86_get_ctrl() to calculate the value of the performance control register. This is generic code usable for all models. The event and reserved masks are model specific and stored in struct op_x86_model_spec. 64 bit MSR functions are used now. The patch removes many hard to read macros used for ctrl calculation. The function op_x86_get_ctrl() is common code and the first step to further merge performance counter implementations for x86 models. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 20 +++++++++++++++++++ arch/x86/oprofile/op_model_amd.c | 41 +++++++++++++++------------------------ arch/x86/oprofile/op_model_ppro.c | 29 +++++++++++++-------------- arch/x86/oprofile/op_x86_model.h | 15 ++++++++++---- 4 files changed, 60 insertions(+), 45 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c31f87bbf436..388ee15e0e42 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -31,6 +31,26 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + event &= model->event_mask ? 
model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (event & 0x0F00) << 24; + + return val; +} + + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 86e0a01ba125..2406ab863605 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -25,12 +25,11 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#define OP_EVENT_MASK 0x0FFF + +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) static unsigned long reset_value[NUM_COUNTERS]; @@ -84,21 +83,19 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) } } - static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -112,19 +109,11 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < NUM_COUNTERS; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); - - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -486,14 +475,16 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .shutdown = &op_amd_shutdown, }; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 40b44ee521d5..3092f998baf2 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -10,6 +10,7 @@ * @author Philippe Elie * @author Graydon Hoare * @author Andi Kleen + * @author Robert Richter */ #include @@ -26,8 +27,8 @@ static int num_counters = 2; static int counter_width = 32; #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) -#define CTRL_CLEAR(x) (x &= (1<<21)) -#define 
CTRL_SET_EVENT(val, e) (val |= e) + +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -54,7 +55,7 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) static void ppro_setup_ctrs(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) { @@ -85,9 +86,9 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -101,17 +102,11 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < num_counters; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR(low); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -205,6 +200,7 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = 2, .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -249,6 +245,7 @@ static int arch_perfmon_init(struct oprofile_operations *ignore) } struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 6161c7f0e7fb..3220d4ce6322 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -6,21 +6,19 @@ * @remark Read the file COPYING * * @author Graydon Hoare + * @author Robert Richter */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H +#include #include #define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) #define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) #define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_ENABLE(val) ((val) |= ARCH_PERFMON_EVENTSEL_INT) #define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_KERN(val, k) ((val) |= ((k) ? ARCH_PERFMON_EVENTSEL_OS : 0)) -#define CTRL_SET_USR(val, u) ((val) |= ((u) ? ARCH_PERFMON_EVENTSEL_USR : 0)) -#define CTRL_SET_UM(val, m) ((val) |= ((m) << 8)) struct op_saved_msr { unsigned int high; @@ -39,12 +37,16 @@ struct op_msrs { struct pt_regs; +struct oprofile_operations; + /* The model vtable abstracts the differences between * various x86 CPU models' perfctr support. 
*/ struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; + u64 reserved; + u16 event_mask; int (*init)(struct oprofile_operations *ops); void (*exit)(void); void (*fill_in_addresses)(struct op_msrs * const msrs); @@ -57,6 +59,11 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; +struct op_counter_config; + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); + extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -- cgit v1.2.3 From 42399adb239d4f1413899cc618ecf640779e79df Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 17:59:06 +0200 Subject: x86/oprofile: replace CTR_OVERFLOWED macros The patch replaces all CTR_OVERFLOWED macros. 64 bit MSR functions and 64 bit counter values are used now. Thus, it will be easier to later extend the models to use more than 32 bit width counters. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 16 ++++++++-------- arch/x86/oprofile/op_model_p4.c | 6 +++--- arch/x86/oprofile/op_model_ppro.c | 10 ++++------ 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 2406ab863605..b5d678fbf038 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,11 +26,10 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 #define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - static unsigned long reset_value[NUM_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS @@ -241,17 +240,18 @@ static inline void op_amd_stop_ibs(void) { } static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->counters[i].addr, low, high); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); - } + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 05ba0287b1f7..ac4ca28b9ed5 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -32,6 +32,8 @@ #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) +#define OP_CTR_OVERFLOW (1ULL<<31) + static unsigned int num_counters = NUM_COUNTERS_NON_HT; static unsigned int num_controls = NUM_CONTROLS_NON_HT; @@ -362,8 +364,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) - /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" @@ -622,7 +622,7 @@ static int p4_check_ctrs(struct pt_regs * const regs, rdmsr(p4_counters[real].cccr_address, low, high); rdmsr(p4_counters[real].counter_address, ctr, high); - if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + if 
(CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); wrmsr(p4_counters[real].counter_address, -(u32)reset_value[i], -1); diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 3092f998baf2..82db396dc3ef 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -26,8 +26,6 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - #define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -124,10 +122,10 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); - if (CTR_OVERFLOWED(val)) { - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } /* Only P6 based Pentium M need to re-unmask the apic vector but it -- cgit v1.2.3 From dea3766ca052a4f572b16a23a322553c064d75af Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 18:11:52 +0200 Subject: x86/oprofile: replace CTRL_SET_*ACTIVE macros The patch replaces all CTRL_SET_*ACTIVE macros. 64 bit MSR functions and 64 bit counter values are used now. The code uses bit masks from . Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 16 ++++++++-------- arch/x86/oprofile/op_model_ppro.c | 16 ++++++++-------- arch/x86/oprofile/op_x86_model.h | 2 -- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b5d678fbf038..4ac9d283e8d2 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -262,13 +262,13 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, static void op_amd_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (reset_value[i]) { - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_ACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } @@ -277,7 +277,7 @@ static void op_amd_start(struct op_msrs const * const msrs) static void op_amd_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* @@ -287,9 +287,9 @@ static void op_amd_stop(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_INACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } op_amd_stop_ibs(); diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 82db396dc3ef..566b43f0b6c6 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -145,16 +145,16 @@ static int ppro_check_ctrs(struct pt_regs * const regs, static void ppro_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_ACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val |= 
ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } } @@ -162,7 +162,7 @@ static void ppro_start(struct op_msrs const * const msrs) static void ppro_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) @@ -170,9 +170,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_INACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 3220d4ce6322..1c4577795a92 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -17,8 +17,6 @@ #define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) #define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) -#define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) struct op_saved_msr { unsigned int high; -- cgit v1.2.3 From 217d3cfb959756cb493fc03106c0253baa420ce8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 4 Jun 2009 02:36:44 +0200 Subject: x86/oprofile: replace CTR*_IS_RESERVED macros The patch replaces all CTR*_IS_RESERVED macros. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 10 +++++----- arch/x86/oprofile/op_model_p4.c | 10 +++++----- arch/x86/oprofile/op_model_ppro.c | 10 +++++----- arch/x86/oprofile/op_x86_model.h | 3 --- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 4ac9d283e8d2..c5c5eec2fa74 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -90,7 +90,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -99,14 +99,14 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsr(msrs->counters[i].addr, -1, -1); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); rdmsrl(msrs->controls[i].addr, val); @@ -300,11 +300,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index ac4ca28b9ed5..9db0ca9af764 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -559,7 +559,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const 
*model, /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -569,14 +569,14 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, @@ -679,7 +679,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } /* @@ -688,7 +688,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) * This saves a few bits. */ for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 566b43f0b6c6..0a261a5c696e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -82,7 +82,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0 ; i < num_counters; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -91,14 +91,14 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < num_counters; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsrl(msrs->counters[i].addr, -reset_value[i]); rdmsrl(msrs->controls[i].addr, val); @@ -181,11 +181,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } for (i = 0 ; i < num_counters ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 1c4577795a92..69f1eb46e1b3 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -15,9 +15,6 @@ #include #include -#define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) -#define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 
1 : 0) - struct op_saved_msr { unsigned int high; unsigned int low; -- cgit v1.2.3 From bbc5986d2db427fdd61b6116ff8b9ed988e663a8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 17:38:19 +0200 Subject: x86/oprofile: use 64 bit wrmsr functions This patch replaces some wrmsr() functions with wrmsrl(). Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 ++++--- arch/x86/oprofile/op_model_p4.c | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c5c5eec2fa74..9bf901762411 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -101,14 +101,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!msrs->counters[i].addr)) continue; - wrmsr(msrs->counters[i].addr, -1, -1); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); + wrmsrl(msrs->counters[i].addr, + -(s64)counter_config[i].count); rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; val |= op_x86_get_ctrl(model, &counter_config[i]); @@ -251,7 +252,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, if (val & OP_CTR_OVERFLOW) continue; oprofile_add_sample(regs, i); - wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); + wrmsrl(msrs->counters[i].addr, -(s64)reset_value[i]); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9db0ca9af764..f01e53b118fa 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -579,8 +579,8 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(u32)counter_config[i].count, -1); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(s64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -624,12 +624,12 @@ static int p4_check_ctrs(struct pt_regs * const regs, rdmsr(p4_counters[real].counter_address, ctr, high); if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); - wrmsr(p4_counters[real].counter_address, - -(u32)reset_value[i], -1); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); CCCR_CLEAR_OVF(low); wrmsr(p4_counters[real].cccr_address, low, high); - wrmsr(p4_counters[real].counter_address, - -(u32)reset_value[i], -1); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); } } -- cgit v1.2.3 From 95e74e62c1540b1115fe8cec5b592f22960f2bb2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 19:09:27 +0200 Subject: x86/oprofile: use 64 bit values to save MSR states This patch removes struct op_saved_msr and replaces it with a u64 variable. This simplifies the code and makes it possible to use the 64 bit MSR functions.
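For illustration, a condensed sketch of the save path before and after this change (mirroring the hunks below, with the loop context omitted):

    /* before: the two 32 bit halves were read separately
       into struct op_saved_msr */
    rdmsr(counters[i].addr, counters[i].saved.low, counters[i].saved.high);

    /* after: 'saved' is a plain u64, so one 64 bit read suffices */
    rdmsrl(counters[i].addr, counters[i].saved);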
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 28 ++++++++-------------------- arch/x86/oprofile/op_x86_model.h | 9 ++------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 388ee15e0e42..3b84b789de0b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -78,19 +78,13 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) unsigned int i; for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); } for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); } } @@ -204,19 +198,13 @@ static void nmi_restore_registers(struct op_msrs *msrs) unsigned int i; for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); } for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); } } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 69f1eb46e1b3..fda52b4c1b95 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -15,14 +15,9 @@ #include #include -struct op_saved_msr { - unsigned int high; - unsigned int low; -}; - struct op_msr { - unsigned long addr; - struct op_saved_msr saved; + unsigned long addr; + u64 saved; }; struct op_msrs { -- cgit v1.2.3 From 1a245c45343651a87ff63afc5ddeb8e24d731835 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Jun 2009 15:54:24 +0200 Subject: x86/oprofile: remove some local variables in MSR save/restore functions The patch removes some local variables in these functions. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3b84b789de0b..80b63d5db509 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -71,18 +71,16 @@ static int profile_exceptions_notify(struct notifier_block *self, static void nmi_cpu_save_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrs; ++i) { + for (i = 0; i < model->num_counters; ++i) { if (counters[i].addr) rdmsrl(counters[i].addr, counters[i].saved); } - for (i = 0; i < nr_ctrls; ++i) { + for (i = 0; i < model->num_controls; ++i) { if (controls[i].addr) rdmsrl(controls[i].addr, controls[i].saved); } @@ -191,18 +189,16 @@ static int nmi_setup(void) static void nmi_restore_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrls; ++i) { + for (i = 0; i < model->num_controls; ++i) { if (controls[i].addr) wrmsrl(controls[i].addr, controls[i].saved); } - for (i = 0; i < nr_ctrs; ++i) { + for (i = 0; i < model->num_counters; ++i) { if (counters[i].addr) wrmsrl(counters[i].addr, counters[i].saved); } -- cgit v1.2.3 From c572ae4efd1b0a5cc76c5e6aae05c1b182b6a69c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 20:10:39 +0200 Subject: x86/oprofile: use 64 bit values in IBS functions The IBS code internally uses 32 bit values (a low and a high value) to represent a 64 bit value. This patch switches the code to 64 bit values throughout, which also allows the 64 bit MSR functions to be used. No functional changes.
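As a sketch of the resulting pattern (the constants are the ones defined in this patch), testing the IbsFetchVal bit, bit 49, no longer needs the high/low split:

    u32 low, high;
    u64 ctl;

    /* before: bit 49 seen as bit 17 of the high half */
    rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
    if (high & IBS_FETCH_HIGH_VALID_BIT)    /* (1UL << 17) */
            /* handle a valid fetch sample */;

    /* after: tested directly in the 64 bit value */
    rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
    if (ctl & IBS_FETCH_VAL)                /* (1ULL << 49) */
            /* handle a valid fetch sample */;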
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 131 ++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 70 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9bf901762411..6493ef7ae9ad 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -35,16 +35,18 @@ static unsigned long reset_value[NUM_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL /*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 static int has_ibs; /* AMD Family10h and later */ @@ -126,66 +128,63 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - u32 low, high; - u64 msr; + u64 val, ctl; struct op_entry entry; if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - oprofile_add_data(&entry, low); - oprofile_add_data(&entry, high); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data(&entry, (u32)ctl); + oprofile_add_data(&entry, (u32)(ctl >> 32)); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); oprofile_write_commit(&entry); /* reenable the IRQ */ - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - 
oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); oprofile_write_commit(&entry); /* reenable the IRQ */ - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; + ctl |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -194,39 +193,31 @@ op_amd_handle_ibs(struct pt_regs * const regs, static inline void op_amd_start_ibs(void) { - unsigned int low, high; + u64 val; if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void op_amd_stop_ibs(void) { - unsigned int low, high; - if (has_ibs && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } + wrmsrl(MSR_AMD64_IBSOPCTL, 0); } #else -- cgit v1.2.3 From 51563a0e5650d0d76539625388d72d62b34c726e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 20:54:56 +0200 Subject: x86/oprofile: introduce oprofile_add_data64() The IBS implementation writes 64 bit register values to the cpu buffer by writing two 32 bit values using oprofile_add_data(). This patch introduces oprofile_add_data64() to write a single 64 bit value to the buffer.
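A caller-side sketch of the conversion (condensed from the hunks below):

    rdmsrl(MSR_AMD64_IBSOPRIP, val);
    /* before: the 64 bit value was pushed as two 32 bit words */
    oprofile_add_data(&entry, (u32)val);
    oprofile_add_data(&entry, (u32)(val >> 32));
    /* after: one call; it returns 0 if the buffer cannot take both words */
    oprofile_add_data64(&entry, val);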
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 27 +++++++++------------------ drivers/oprofile/cpu_buffer.c | 15 +++++++++++++++ include/linux/oprofile.h | 1 + 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6493ef7ae9ad..cc930467575d 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -140,13 +140,10 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); - oprofile_add_data(&entry, (u32)ctl); - oprofile_add_data(&entry, (u32)(ctl >> 32)); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ @@ -162,23 +159,17 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSOPRIP, val); oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA2, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA3, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCLINAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 50640cc5eef2..a7aae24f2889 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -406,6 +406,21 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val) return op_cpu_buffer_add_data(entry, val); } +int oprofile_add_data64(struct op_entry *entry, u64 val) +{ + if (!entry->event) + return 0; + if (op_cpu_buffer_get_size(entry) < 2) + /* + * the function returns 0 to indicate a too small + * buffer, even if there is some space left + */ + return 0; + if (!op_cpu_buffer_add_data(entry, (u32)val)) + return 0; + return op_cpu_buffer_add_data(entry, (u32)(val >> 32)); +} + int oprofile_write_commit(struct op_entry *entry) { if (!entry->event) diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index dbbe2dbc4418..d68d2ed94f15 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -179,6 +179,7 @@ void oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs, unsigned long pc, int code, int size); int oprofile_add_data(struct op_entry *entry, unsigned long val); +int oprofile_add_data64(struct op_entry *entry, u64 val); int oprofile_write_commit(struct op_entry *entry); #endif /* OPROFILE_H */ -- cgit v1.2.3 From 1cc4ce6f5f89cdc355013aa43f06a14a015766d1 Mon Sep 17 
00:00:00 2001 From: Maynard Johnson Date: Wed, 27 May 2009 10:15:08 -0500 Subject: oprofile: reset bt_lost_no_mapping with other stats The bt_lost_no_mapping statistic is not getting reset at the start of a profiling run, thus oprofiled.log shows erroneous values for it. The attached patch fixes this problem. Signed-off-by: Maynard Johnson Signed-off-by: Robert Richter --- drivers/oprofile/oprofile_stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index e1f6ce03705e..efefae539a59 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -33,6 +33,7 @@ void oprofile_reset_stats(void) atomic_set(&oprofile_stats.sample_lost_no_mm, 0); atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); atomic_set(&oprofile_stats.event_lost_overflow, 0); + atomic_set(&oprofile_stats.bt_lost_no_mapping,0); } -- cgit v1.2.3 From 802070f5474af1a49435a9528aede47bb18abd47 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 12 Jun 2009 18:32:07 +0200 Subject: x86/oprofile: fix initialization of arch_perfmon for core_i7 Commit e419294 (x86/oprofile: moving arch_perfmon counter setup to op_x86_model_spec.init) introduced a bug in the initialization of core_i7, causing the model to be incorrectly set to &op_ppro_spec. This patch fixes this. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7826dfcc8428..28ee490c1b80 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -406,6 +406,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec const *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; @@ -432,7 +433,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - model = &op_arch_perfmon_spec; + spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -443,7 +444,7 @@ static int __init ppro_init(char **cpu_type) return 0; } - model = &op_ppro_spec; + model = spec; return 1; } -- cgit v1.2.3 From c64b04fe6e0cb7c78e01751a44ef56cf20344e87 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 14 Jun 2009 00:59:50 +0530 Subject: x86, cpu: cpu/proc.c display cache alignment and address sizes for 32 bit 32 bit kernels can also access x86_cache_alignment, x86_phys_bits and x86_virt_bits, so make them available to user space just as on 64 bit. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1244921390.11733.30.camel@ht.satnam> Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/proc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246b..f82706a3901d 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { -- cgit v1.2.3 From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? 
In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/sched.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4896fdfec913..d4a2c6662f7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 #ifdef __KERNEL__ @@ -1209,6 +1211,10 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + + /* Revert to default priority/policy when forking */ + unsigned sched_reset_on_fork:1; + pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 8ec9d13140be..32e6ede85255 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Revert to default priority/policy on fork if requested. Make sure we + * do not leak PI boosting priority to the child. */ - p->prio = current->normal_prio; + if (current->sched_reset_on_fork && + (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) + p->policy = SCHED_NORMAL; + + if (current->sched_reset_on_fork && + (current->normal_prio < DEFAULT_PRIO)) + p->prio = DEFAULT_PRIO; + else + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * We don't need the reset flag anymore after the fork.
It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6148,6 +6172,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6191,6 +6219,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ -- cgit v1.2.3 From b9dc29e72fd3dc2a739ce8eafd958220d0745734 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:46:01 +0200 Subject: sched: Clean up SCHED_RESET_ON_FORK Make SCHED_RESET_ON_FORK sched_fork() bits a self-contained unlikely code path. Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228361.18329.6.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 32e6ede85255..50e4e3d15e83 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,28 +2613,30 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Revert to default priority/policy on fork if requested. Make sure we - * do not leak PI boosting priority to the child. + * Make sure we do not leak PI boosting priority to the child. */ - if (current->sched_reset_on_fork && - (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) - p->policy = SCHED_NORMAL; + p->prio = current->normal_prio; - if (current->sched_reset_on_fork && - (current->normal_prio < DEFAULT_PRIO)) - p->prio = DEFAULT_PRIO; - else - p->prio = current->normal_prio; + /* + * Revert to default priority/policy on fork if requested. 
+ */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); -- cgit v1.2.3 From 6c697bdf08a09ce461e305a22362973036e95db3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:48:02 +0200 Subject: sched: Add SCHED_RESET_ON_FORK functionality for nice < 0 tasks Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228482.27326.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 50e4e3d15e83..34f94240642f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2627,6 +2627,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->normal_prio < DEFAULT_PRIO) p->prio = DEFAULT_PRIO; + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: -- cgit v1.2.3 From 4555835b707d5c778ee1c9076670bc99b1eeaf61 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 17 Jun 2009 14:44:19 +0530 Subject: x86: hw_breakpoint.c arch_check_va_in_kernelspace and hw_breakpoint_handler should be static arch_check_va_in_kernelspace() and hw_breakpoint_handler() are used only within the same file, so they should be static. Also fixed the non-ANSI function declaration of 'arch_uninstall_thread_hw_breakpoint'. Fixed the following sparse warnings: arch/x86/kernel/hw_breakpoint.c:124:42: warning: non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' arch/x86/kernel/hw_breakpoint.c:169:5: warning: symbol 'arch_check_va_in_kernelspace' was not declared. Should it be static? arch/x86/kernel/hw_breakpoint.c:313:15: warning: symbol 'hw_breakpoint_handler' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Alan Stern Cc: "K.Prasad" Cc: Frederic Weisbecker LKML-Reference: <1245230059.2662.4.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 51d959528b1d..9316a9de4de3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -121,7 +121,7 @@ void arch_install_thread_hw_breakpoint(struct task_struct *tsk) /* * Install the debug register values for just the kernel, no thread. */ -void arch_uninstall_thread_hw_breakpoint() +void arch_uninstall_thread_hw_breakpoint(void) { /* Clear the user-space portion of debugreg7 by setting only kdr7 */ set_debugreg(kdr7, 7); @@ -166,7 +166,7 @@ int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) /* * Check for virtual address in kernel space.
*/ -int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) { unsigned int len; @@ -310,7 +310,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) * NOTIFY_STOP returned for all other cases * */ -int __kprobes hw_breakpoint_handler(struct die_args *args) +static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; -- cgit v1.2.3 From ac5672f82c39ff2f8dce81bf3e68b1dfc41f366f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Apr 2009 14:29:44 -0700 Subject: x86/paravirt: split paravirt definitions into paravirt_types.h Split the monolithic asm/paravirt.h into separate paravirt.h (inlines and other "active" definitions), and paravirt_types.h (types, constants and other "passive" definitions). This makes it easier to use the type/constant definitions without pulling in everything else and causing circular dependency problems. [ Impact: cleanup ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/paravirt.h | 711 +-------------------------------- arch/x86/include/asm/paravirt_types.h | 720 ++++++++++++++++++++++++++++++++++ 2 files changed, 721 insertions(+), 710 deletions(-) create mode 100644 arch/x86/include/asm/paravirt_types.h diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a0832..6a07af432c81 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,689 +7,11 @@ #include #include -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - -#include -#endif /* X86_64 */ - -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) +#include #ifndef __ASSEMBLY__ #include #include -#include -#include - -struct page; -struct thread_struct; -struct desc_ptr; -struct tss_struct; -struct mm_struct; -struct desc_struct; -struct task_struct; - -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - -/* general info */ -struct pv_info { - unsigned int kernel_rpl; - int shared_kernel_pmd; - int paravirt_enabled; - const char *name; -}; - -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. 
- */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, - unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); -}; - - -struct pv_lazy_ops { - /* Set deferred update mode, used for batching operations. */ - void (*enter)(void); - void (*leave)(void); -}; - -struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - - unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); -}; - -struct pv_cpu_ops { - /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); - - void (*clts)(void); - - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - - /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); - void (*store_gdt)(struct desc_ptr *); - void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 - void (*load_gs_index)(unsigned int idx); -#endif - void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, - const void *desc); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, const void *desc, int size); - void (*write_idt_entry)(gate_desc *, - int entrynum, const gate_desc *gate); - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); - - void (*set_iopl_mask)(unsigned mask); - - void (*wbinvd)(void); - void (*io_delay)(void); - - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); - - /* - * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) - */ - void (*irq_enable_sysexit)(void); - - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - - /* Normal iret. 
Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - - void (*swapgs)(void); - - void (*start_context_switch)(struct task_struct *prev); - void (*end_context_switch)(struct task_struct *next); -}; - -struct pv_irq_ops { - void (*init_IRQ)(void); - - /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. - */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - - void (*safe_halt)(void); - void (*halt)(void); - -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; - -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - -struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. - */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long va); - - /* Hooks for allocating and freeing a pagetable top-level */ - int (*pgd_alloc)(struct mm_struct *mm); - void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); - - /* - * Hooks for allocating/releasing pagetable pages when they're - * attached to a pagetable - */ - void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); - void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); - void (*release_pte)(unsigned long pfn); - void (*release_pmd)(unsigned long pfn); - void (*release_pud)(unsigned long pfn); - - /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); - - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; - - struct paravirt_callee_save pgd_val; - struct 
paravirt_callee_save make_pgd; - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - - void (*set_pud)(pud_t *pudp, pud_t pudval); - - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; - -#if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; - - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ - -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - - struct pv_lazy_ops lazy_mode; - - /* dom0 ops */ - - /* Sometimes the physical address is a pfn, and sometimes its - an mfn. We can tell which is which from the index. */ - void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - phys_addr_t phys, pgprot_t flags); -}; - -struct raw_spinlock; -struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); -}; - -/* This contains all the paravirt structures: we get a convenient - * number for each function using the offset which we use to indicate - * what to patch. */ -struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; -}; - -extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; - -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. 
*/ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - -unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); - -int paravirt_disable_iospace(void); - -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" - -/* - * These macros are intended to wrap calls through one of the paravirt - * ops structs, so that they can be later identified and patched at - * runtime. - * - * Normally, a call to a pv_op function is a simple indirect call: - * (pv_op_struct.operations)(args...). - * - * Unfortunately, this is a relatively slow operation for modern CPUs, - * because it cannot necessarily determine what the destination - * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or - * ideally, patch an inline implementation into the callsite. (Direct - * calls are essentially free, because the call and return addresses - * are completely predictable.) - * - * For i386, these macros rely on the standard gcc "regparm(3)" calling - * convention, in which the first three arguments are placed in %eax, - * %edx, %ecx (in that order), and the remaining arguments are placed - * on the stack. All caller-save registers (eax,edx,ecx) are expected - * to be modified (either clobbered or used for return values). - * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, - * %rdx, and %rcx. Note that for this reason, x86_64 does not need any - * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which - * unfortunately, are quite a bit (r8 - r11) - * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * - * Unfortunately there's no way to get gcc to generate the args setup - * for the call, and then allow the call itself to be generated by an - * inline asm. Because of this, we must do the complete arg setup and - * return value handling from within these macros. This is fairly - * cumbersome. - * - * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. 
- * It could be extended to more arguments, but there would be little - * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. - * - * When there is a return value, the invoker of the macro must specify - * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return - * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of - * the return value size. - * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments - * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments - * in low,high order - * - * Small structures are passed and returned in registers. The macro - * calling convention can't directly deal with this, so the wrapper - * functions must do this. - * - * These PVOP_* macros are only defined within this header. This - * means that all uses must be wrapped in inline functions. This also - * makes sure the incoming and outgoing types are always correct. - */ -#ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS -#define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ - "=S" (__esi), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" -#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) -#else -#define PVOP_TEST_NULL(op) ((void)op) -#endif - -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ - ({ \ - rettype __ret; \ - PVOP_CALL_ARGS; \ - PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ - } \ - __ret; \ - }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) 
\ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ - ({ \ - PVOP_VCALL_ARGS; \ - PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - }) - -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - -#define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") -#define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") - -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - -#define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - - -#define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - - -#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -#define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) - -/* This is the only difference in x86_64. 
We can make it much simpler */ -#ifdef CONFIG_X86_32 -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif static inline int paravirt_enabled(void) { @@ -1393,20 +715,6 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); - #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { @@ -1437,12 +745,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, pv_mmu_ops.set_fixmap(idx, phys, flags); } -void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - -#define paravirt_nop ((void *)_paravirt_nop) - #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static inline int __raw_spin_is_locked(struct raw_spinlock *lock) @@ -1479,17 +781,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) #endif -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ - u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ -}; - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" #define PV_RESTORE_REGS "popl %edx; popl %ecx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 000000000000..2b3371bae295 --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h @@ -0,0 +1,720 @@ +#ifndef _ASM_X86_PARAVIRT_TYPES_H +#define _ASM_X86_PARAVIRT_TYPES_H + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) + +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. 
For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) +#define CLBR_SCRATCH (0) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) + +#define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + +#endif /* X86_64 */ + +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + +#ifndef __ASSEMBLY__ + +#include +#include + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; +struct task_struct; +struct cpumask; + +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. + */ + unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned long addr, unsigned len); + + /* Basic arch-specific setup */ + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*post_allocator_init)(void); + + /* Print a banner to identify the environment */ + void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. 
*/ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); + + /* Get and set time of day */ + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + + unsigned long long (*sched_clock)(void); + unsigned long (*get_tsc_khz)(void); +}; + +struct pv_cpu_ops { + /* hooks for various privileged instructions */ + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); + + void (*clts)(void); + + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + +#ifdef CONFIG_X86_64 + unsigned long (*read_cr8)(void); + void (*write_cr8)(unsigned long); +#endif + + /* Segment descriptor handling */ + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif + void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, + const void *desc); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, const void *desc, int size); + void (*write_idt_entry)(gate_desc *, + int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); + void (*io_delay)(void); + + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSC operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr_amd)(unsigned int msr, int *err); + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(int counter); + unsigned long long (*read_tscp)(unsigned int *aux); + + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ + void (*irq_enable_sysexit)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state.
save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. + */ + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; + + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + 
struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; + +#if PAGETABLE_LEVELS == 4 + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. 
*/ +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len); + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len); + +int paravirt_disable_iospace(void); + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + +/* + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (pv_op_struct.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to be a simple direct call, or + * ideally, patch an inline implementation into the callsite. (Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * For i386, these macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * X86_64, on the other hand, already specifies a register-based calling + * convention, returning in %rax, with parameters going in %rdi, %rsi, + * %rdx, and %rcx. Note that for this reason, x86_64 does not need any + * special handling for dealing with 4 arguments, unlike i386. + * However, x86_64 also has to clobber all caller-saved registers, which + * unfortunately are quite a few (r8 - r11). + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend pv_init_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) + +#define __PVOP_CALL(rettype, op, pre, post, ...) 
\ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + }) + +#define __PVOP_VCALL(op, pre, post, ...) \ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + +/* This is the only difference in x86_64. 
We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#endif + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); + +void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + +#define paravirt_nop ((void *)_paravirt_nop) + +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PARAVIRT_TYPES_H */ -- cgit v1.2.3 From e6e9cac8c3417b43498b243c1f8f11780e157168 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:40:59 -0700 Subject: x86: split out core __math_state_restore Split the core fpu state restoration out into __math_state_restore, which assumes that cr0.TS is clear and that the fpu context has been initialized. This will be used during context switch. There are two reasons this is desirable: - There's a small clarification. When __switch_to() calls math_state_restore, it relies on the fact that tsk_used_math() returns true, and so will never do a blocking init_fpu(). __math_state_restore() does not have (or need) that logic, so the question never arises. - It allows the clts() to be moved earlier in __switch_to() so it can be performed while cpu context updates are batched (will be done in a later patch).
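As a rough sketch of where this lands (assuming the helper names introduced by this patch and the preload_fpu logic added by the follow-on __switch_to() patches later in this series), the two entry points end up looking like:

	/* Trap path: may block in init_fpu(), clears cr0.TS itself. */
	asmlinkage void math_state_restore(void)
	{
		/* ... tsk_used_math()/init_fpu() handling ... */
		clts();		/* allow maths ops (or we recurse) */
		__math_state_restore();
	}

	/* Context-switch path (later patch): cr0.TS was already
	 * cleared while the cpu context updates were being batched. */
	if (preload_fpu)
		__math_state_restore();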
[ Impact: refactor code to make reuse cleaner; no functional change ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/include/asm/i387.h | 1 + arch/x86/kernel/traps.c | 33 +++++++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4f..2e7529295f55 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -26,6 +26,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void __math_state_restore(void); extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5f935f0d5861..71b91669ad19 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,6 +813,28 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -844,17 +866,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3 From 2fcddce10f6771cfa0c56fd1e826d50d67d100b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:45:26 -0700 Subject: x86-32: make sure clts is batched during context switch If we're preloading the fpu state during context switch, make sure the clts happens while we're batching the cpu context update, then do the actual __math_state_restore once the updates are flushed. This allows more efficient context switches when running paravirtualized, as all the hypercalls can be folded together into one. [ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_32.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984af..a80eddd41658 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -350,14 +350,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +405,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +419,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) -- cgit v1.2.3 From 16d9dbf0c2bd167fdd942b83592d59696c7b73bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:50:27 -0700 Subject: x86-64: move unlazy_fpu() into lazy cpu state part of context switch Make sure that unlazy_fpu()'s stts gets batched along with the other cpu state changes during context switch. (32-bit already does this.) This makes sure it gets batched when running paravirtualized. [ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9d..c9b8904736db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +462,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ -- cgit v1.2.3 From 17950c5b243f99cbabef173415ee988c52104d7e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 01:01:01 -0700 Subject: x86-64: move clts into batch cpu state updates when preloading fpu When a task is likely to be using the fpu, we preload its state during the context switch, rather than waiting for it to run an fpu instruction. Make sure the clts() happens while we're doing batched fpu state updates to optimise paravirtualized context switches. 
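Spelled out, the intended ordering on the 64-bit path is roughly the following (a sketch of the diff below, with the unrelated context-switch steps elided):

	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
	/* ... */
	unlazy_fpu(prev_p);		/* stts() gets batched */
	if (preload_fpu)
		clts();			/* batched as well */
	arch_end_context_switch(next_p);	/* one flush for all lazy updates */
	/* ... */
	if (preload_fpu)
		__math_state_restore();	/* cr0.TS is already clear here */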
[ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_64.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c9b8904736db..a28279dbb07c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,9 +386,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -422,6 +430,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ unlazy_fpu(prev_p); + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -480,15 +492,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } -- cgit v1.2.3 From 21e70878215f620fe99ea7d7c74bc641aeec932f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 18 Jun 2009 17:09:27 +0530 Subject: x86: oprofile/op_model_amd.c set return values for op_amd_handle_ibs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit op_amd_handle_ibs() should return 0 when IBS is not present or not defined. 
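In other words, a stub for a compiled-out feature still has to return a value when its return type is non-void; as a sketch of the pattern (the actual hunk is below), the stub becomes:

	/* IBS compiled out: report "sample not handled" (0) instead of
	 * falling off the end of a non-void function. */
	static inline int op_amd_handle_ibs(struct pt_regs * const regs,
					    struct op_msrs const * const msrs)
	{
		return 0;
	}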
Fix compilation warning: CC [M] arch/x86/oprofile/op_model_amd.o arch/x86/oprofile/op_model_amd.c: In function ‘op_amd_handle_ibs’: arch/x86/oprofile/op_model_amd.c:217: warning: no return statement in function returning non-void Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index cc930467575d..e95268eb9220 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -132,7 +132,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 1; + return 0; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -214,7 +214,10 @@ static void op_amd_stop_ibs(void) #else static inline int op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } + struct op_msrs const * const msrs) +{ + return 0; +} static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3 From 4adc667593f83a18a8e54ce94f250fd166a272ac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:16 +0200 Subject: x86: add copies of some headers to convert to asm-generic Just an intermediate step to make reviewing easier. These files are identical copies of the existing headers. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/generic-mman.h | 20 ++++++++ arch/x86/include/asm/generic-module.h | 80 ++++++++++++++++++++++++++++++ arch/x86/include/asm/generic-scatterlist.h | 33 ++++++++++++ arch/x86/include/asm/generic-types.h | 30 +++++++++++ arch/x86/include/asm/generic-ucontext.h | 18 +++++++ 5 files changed, 181 insertions(+) create mode 100644 arch/x86/include/asm/generic-mman.h create mode 100644 arch/x86/include/asm/generic-module.h create mode 100644 arch/x86/include/asm/generic-scatterlist.h create mode 100644 arch/x86/include/asm/generic-types.h create mode 100644 arch/x86/include/asm/generic-ucontext.h diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h new file mode 100644 index 000000000000..751af2550ed9 --- /dev/null +++ b/arch/x86/include/asm/generic-mman.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_MMAN_H +#define _ASM_X86_MMAN_H + +#include + +#define MAP_32BIT 0x40 /* only give out 32bit addresses */ + +#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ +#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ +#define MAP_LOCKED 0x2000 /* pages are locked */ +#define MAP_NORESERVE 0x4000 /* don't check for reservations */ +#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ +#define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ + +#define MCL_CURRENT 1 /* lock all current mappings */ +#define MCL_FUTURE 2 /* lock all future mappings */ + +#endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h new file mode 100644 index 000000000000..47d62743c4d5 --- /dev/null +++ b/arch/x86/include/asm/generic-module.h @@ -0,0 +1,80 @@ +#ifndef _ASM_X86_MODULE_H +#define _ASM_X86_MODULE_H + +/* x86_32/64 are simple */ +struct mod_arch_specific {}; + +#ifdef CONFIG_X86_32 +# define Elf_Shdr Elf32_Shdr +# define Elf_Sym Elf32_Sym +# define Elf_Ehdr Elf32_Ehdr +#else +# 
define Elf_Shdr Elf64_Shdr +# define Elf_Sym Elf64_Sym +# define Elf_Ehdr Elf64_Ehdr +#endif + +#ifdef CONFIG_X86_64 +/* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M386 +#define MODULE_PROC_FAMILY "386 " +#elif defined CONFIG_M486 +#define MODULE_PROC_FAMILY "486 " +#elif defined CONFIG_M586 +#define MODULE_PROC_FAMILY "586 " +#elif defined CONFIG_M586TSC +#define MODULE_PROC_FAMILY "586TSC " +#elif defined CONFIG_M586MMX +#define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_M686 +#define MODULE_PROC_FAMILY "686 " +#elif defined CONFIG_MPENTIUMII +#define MODULE_PROC_FAMILY "PENTIUMII " +#elif defined CONFIG_MPENTIUMIII +#define MODULE_PROC_FAMILY "PENTIUMIII " +#elif defined CONFIG_MPENTIUMM +#define MODULE_PROC_FAMILY "PENTIUMM " +#elif defined CONFIG_MPENTIUM4 +#define MODULE_PROC_FAMILY "PENTIUM4 " +#elif defined CONFIG_MK6 +#define MODULE_PROC_FAMILY "K6 " +#elif defined CONFIG_MK7 +#define MODULE_PROC_FAMILY "K7 " +#elif defined CONFIG_MK8 +#define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_X86_ELAN +#define MODULE_PROC_FAMILY "ELAN " +#elif defined CONFIG_MCRUSOE +#define MODULE_PROC_FAMILY "CRUSOE " +#elif defined CONFIG_MEFFICEON +#define MODULE_PROC_FAMILY "EFFICEON " +#elif defined CONFIG_MWINCHIPC6 +#define MODULE_PROC_FAMILY "WINCHIPC6 " +#elif defined CONFIG_MWINCHIP3D +#define MODULE_PROC_FAMILY "WINCHIP3D " +#elif defined CONFIG_MCYRIXIII +#define MODULE_PROC_FAMILY "CYRIXIII " +#elif defined CONFIG_MVIAC3_2 +#define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " +#elif defined CONFIG_MGEODEGX1 +#define MODULE_PROC_FAMILY "GEODEGX1 " +#elif defined CONFIG_MGEODE_LX +#define MODULE_PROC_FAMILY "GEODE " +#else +#error unknown processor family +#endif + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_4KSTACKS +# define MODULE_STACKSIZE "4KSTACKS " +# else +# define MODULE_STACKSIZE "" +# endif +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +#endif + +#endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h new file mode 100644 index 000000000000..263d397d2eef --- /dev/null +++ b/arch/x86/include/asm/generic-scatterlist.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_SCATTERLIST_H +#define _ASM_X86_SCATTERLIST_H + +#include + +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; + unsigned int dma_length; +}; + +#define ARCH_HAS_SG_CHAIN +#define ISA_DMA_THRESHOLD (0x00ffffff) + +/* + * These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns. 
+ */ +#define sg_dma_address(sg) ((sg)->dma_address) +#ifdef CONFIG_X86_32 +# define sg_dma_len(sg) ((sg)->length) +#else +# define sg_dma_len(sg) ((sg)->dma_length) +#endif + +#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h new file mode 100644 index 000000000000..09b97745772f --- /dev/null +++ b/arch/x86/include/asm/generic-types.h @@ -0,0 +1,30 @@ +#ifndef _ASM_X86_TYPES_H +#define _ASM_X86_TYPES_H + +#include + +#ifndef __ASSEMBLY__ + +typedef unsigned short umode_t; + +#endif /* __ASSEMBLY__ */ + +/* + * These aren't exported outside the kernel to avoid name space clashes + */ +#ifdef __KERNEL__ + +#ifndef __ASSEMBLY__ + +typedef u64 dma64_addr_t; +#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) +/* DMA addresses come in 32-bit and 64-bit flavours. */ +typedef u64 dma_addr_t; +#else +typedef u32 dma_addr_t; +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h new file mode 100644 index 000000000000..87324cf439d9 --- /dev/null +++ b/arch/x86/include/asm/generic-ucontext.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_UCONTEXT_H +#define _ASM_X86_UCONTEXT_H + +#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state + * information in the memory layout pointed + * by the fpstate pointer in the ucontext's + * sigcontext struct (uc_mcontext). + */ + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + sigset_t uc_sigmask; /* mask last for extensibility */ +}; + +#endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3 From 7bfd124d6dae7d394e73753300594a81a022fe7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:17 +0200 Subject: x86: convert trivial headers to asm-generic version For these nine header files, the asm-generic version should be semantically identical to what is in x86. Change the contents to be binary identical, for better review. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/ioctls.h | 40 ++++++++++++++++++++++++++++------------ arch/x86/include/asm/ipcbuf.h | 15 ++++++++++----- arch/x86/include/asm/msgbuf.h | 30 +++++++++++++++++++----------- arch/x86/include/asm/param.h | 8 +++++--- arch/x86/include/asm/shmbuf.h | 28 ++++++++++++++++++---------- arch/x86/include/asm/socket.h | 10 +++++----- arch/x86/include/asm/sockios.h | 6 +++--- arch/x86/include/asm/termbits.h | 8 ++++---- 8 files changed, 92 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06e..a799e20a769e 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,12 +1,23 @@ -#ifndef _ASM_X86_IOCTLS_H -#define _ASM_X86_IOCTLS_H +#ifndef __ASM_GENERIC_IOCTLS_H +#define __ASM_GENERIC_IOCTLS_H -#include +#include + +/* + * These are the most common definitions for tty ioctl numbers. + * Most of them do not use the recommended _IOC(), but there is + * probably some source code out there hardcoding the number, + * so we might as well use them for all new platforms. + * + * The architectures that use different values here typically + * try to be compatible with some Unix variants for the same + * architecture. 
+ */ /* 0x54 is just a magic number to make these relatively unique ('T') */ #define TCGETS 0x5401 -#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ +#define TCSETS 0x5402 #define TCSETSW 0x5403 #define TCSETSF 0x5404 #define TCGETA 0x5405 @@ -43,7 +54,6 @@ #define TIOCSETD 0x5423 #define TIOCGETD 0x5424 #define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ #define TIOCSBRK 0x5427 /* BSD compatibility */ #define TIOCCBRK 0x5428 /* BSD compatibility */ #define TIOCGSID 0x5429 /* Return the session ID of FD */ @@ -53,8 +63,7 @@ #define TCSETSF2 _IOW('T', 0x2D, struct termios2) #define TIOCGRS485 0x542E #define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) - /* Get Pty Number (of pty-mux device) */ +#define TIOCGPTN _IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ #define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ #define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ #define TCSETX 0x5433 @@ -76,9 +85,16 @@ #define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ -#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -#define FIOQSIZE 0x5460 + +/* + * some architectures define FIOQSIZE as 0x545E, which is used for + * TIOCGHAYESESP on others + */ +#ifndef FIOQSIZE +# define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ +# define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +# define FIOQSIZE 0x5460 +#endif /* Used for packet mode */ #define TIOCPKT_DATA 0 @@ -89,6 +105,6 @@ #define TIOCPKT_NOSTOP 16 #define TIOCPKT_DOSTOP 32 -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ +#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -#endif /* _ASM_X86_IOCTLS_H */ +#endif /* __ASM_GENERIC_IOCTLS_H */ diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd51594..888ca1c8f951 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,13 +1,18 @@ -#ifndef _ASM_X86_IPCBUF_H -#define _ASM_X86_IPCBUF_H +#ifndef __ASM_GENERIC_IPCBUF_H +#define __ASM_GENERIC_IPCBUF_H /* - * The ipc64_perm structure for x86 architecture. + * The generic ipc64_perm structure: * Note extra padding because this structure is passed back and forth * between kernel and user space. * + * ipc64_perm was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * * Pad space is left for: - * - 32-bit mode_t and seq + * - 32-bit mode_t on architectures that only had 16 bit + * - 32-bit seq * - 2 miscellaneous 32-bit values */ @@ -25,4 +30,4 @@ struct ipc64_perm { unsigned long __unused2; }; -#endif /* _ASM_X86_IPCBUF_H */ +#endif /* __ASM_GENERIC_IPCBUF_H */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51c..aec850d9159e 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,30 +1,38 @@ -#ifndef _ASM_X86_MSGBUF_H -#define _ASM_X86_MSGBUF_H +#ifndef __ASM_GENERIC_MSGBUF_H +#define __ASM_GENERIC_MSGBUF_H +#include /* - * The msqid64_ds structure for i386 architecture. + * generic msqid64_ds structure. + * * Note extra padding because this structure is passed back and forth * between kernel and user space. 
* - * Pad space on i386 is left for: + * msqid64_ds was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * + * 64 bit architectures typically define a 64 bit __kernel_time_t, + * so they do not need the first three padding words. + * On big-endian systems, the padding is in the wrong place. + * + * Pad space is left for: * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values - * - * Pad space on x8664 is left for: - * - 2 miscellaneous 64-bit values */ + struct msqid64_ds { struct ipc64_perm msg_perm; __kernel_time_t msg_stime; /* last msgsnd time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused1; #endif __kernel_time_t msg_rtime; /* last msgrcv time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused2; #endif __kernel_time_t msg_ctime; /* last change time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused3; #endif unsigned long msg_cbytes; /* current number of bytes on queue */ @@ -36,4 +44,4 @@ struct msqid64_ds { unsigned long __unused5; }; -#endif /* _ASM_X86_MSGBUF_H */ +#endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4ca..cdf8251bfb6c 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_PARAM_H -#define _ASM_X86_PARAM_H +#ifndef __ASM_GENERIC_PARAM_H +#define __ASM_GENERIC_PARAM_H #ifdef __KERNEL__ # define HZ CONFIG_HZ /* Internal kernel timer frequency */ @@ -11,7 +11,9 @@ #define HZ 100 #endif +#ifndef EXEC_PAGESIZE #define EXEC_PAGESIZE 4096 +#endif #ifndef NOGROUP #define NOGROUP (-1) @@ -19,4 +21,4 @@ #define MAXHOSTNAMELEN 64 /* max length of hostname */ -#endif /* _ASM_X86_PARAM_H */ +#endif /* __ASM_GENERIC_PARAM_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b74971..5768fa60ac82 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,32 +1,40 @@ -#ifndef _ASM_X86_SHMBUF_H -#define _ASM_X86_SHMBUF_H +#ifndef __ASM_GENERIC_SHMBUF_H +#define __ASM_GENERIC_SHMBUF_H + +#include /* * The shmid64_ds structure for x86 architecture. * Note extra padding because this structure is passed back and forth * between kernel and user space. * - * Pad space on 32 bit is left for: + * shmid64_ds was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * + * 64 bit architectures typically define a 64 bit __kernel_time_t, + * so they do not need the first two padding words. + * On big-endian systems, the padding is in the wrong place. 
+ * + * + * Pad space is left for: * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values - * - * Pad space on 64 bit is left for: - * - 2 miscellaneous 64-bit values */ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ size_t shm_segsz; /* size of segment (bytes) */ __kernel_time_t shm_atime; /* last attach time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused1; #endif __kernel_time_t shm_dtime; /* last detach time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused2; #endif __kernel_time_t shm_ctime; /* last change time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused3; #endif __kernel_pid_t shm_cpid; /* pid of creator */ @@ -48,4 +56,4 @@ struct shminfo64 { unsigned long __unused4; }; -#endif /* _ASM_X86_SHMBUF_H */ +#endif /* __ASM_GENERIC_SHMBUF_H */ diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index ca8bf2cd0ba9..d4ae42a06a20 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SOCKET_H -#define _ASM_X86_SOCKET_H +#ifndef __ASM_GENERIC_SOCKET_H +#define __ASM_GENERIC_SOCKET_H #include @@ -38,8 +38,8 @@ #define SO_BINDTODEVICE 25 /* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 +#define SO_ATTACH_FILTER 26 +#define SO_DETACH_FILTER 27 #define SO_PEERNAME 28 #define SO_TIMESTAMP 29 @@ -57,4 +57,4 @@ #define SO_TIMESTAMPING 37 #define SCM_TIMESTAMPING SO_TIMESTAMPING -#endif /* _ASM_X86_SOCKET_H */ +#endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c9..9a61a369b901 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SOCKIOS_H -#define _ASM_X86_SOCKIOS_H +#ifndef __ASM_GENERIC_SOCKIOS_H +#define __ASM_GENERIC_SOCKIOS_H /* Socket-level I/O control calls. */ #define FIOSETOWN 0x8901 @@ -10,4 +10,4 @@ #define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ #define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ -#endif /* _ASM_X86_SOCKIOS_H */ +#endif /* __ASM_GENERIC_SOCKIOS_H */ diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440f..1c9773d48cb0 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_TERMBITS_H -#define _ASM_X86_TERMBITS_H +#ifndef __ASM_GENERIC_TERMBITS_H +#define __ASM_GENERIC_TERMBITS_H #include @@ -140,7 +140,7 @@ struct ktermios { #define HUPCL 0002000 #define CLOCAL 0004000 #define CBAUDEX 0010000 -#define BOTHER 0010000 /* non standard rate */ +#define BOTHER 0010000 #define B57600 0010001 #define B115200 0010002 #define B230400 0010003 @@ -195,4 +195,4 @@ struct ktermios { #define TCSADRAIN 1 #define TCSAFLUSH 2 -#endif /* _ASM_X86_TERMBITS_H */ +#endif /* __ASM_GENERIC_TERMBITS_H */ -- cgit v1.2.3 From 06f5013aa8eb5895ced2c71d13f5114103605555 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:18 +0200 Subject: x86: convert almost generic headers to asm-generic version In x86, mman.h, module.h, scatterlist.h, types.h and ucontext.h can use the asm-generic version by just defining the x86 specific parts locally and falling back on the generic code for the common bits. This patch illustrates the differences between the x86 and asm-generic versions by changing a file that is initially identical to the x86 version to one that is identical to the asm-generic version. 
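The resulting wrapper pattern is the same for every converted header: the x86 file keeps only its arch-specific definitions and pulls in the rest. As a sketch of the shape (mman.h is the smallest case; assuming the include path of the generic-mman.h copy added earlier in this series, with the real mman.h hunk in this patch):

	#ifndef _ASM_X86_MMAN_H
	#define _ASM_X86_MMAN_H

	#define MAP_32BIT	0x40	/* only give out 32bit addresses */

	#include <asm/generic-mman.h>

	#endif /* _ASM_X86_MMAN_H */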
Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/generic-mman.h | 8 +-- arch/x86/include/asm/generic-module.h | 92 ++++++------------------------ arch/x86/include/asm/generic-scatterlist.h | 34 +++++++---- arch/x86/include/asm/generic-types.h | 32 +++++++---- arch/x86/include/asm/generic-ucontext.h | 12 +--- arch/x86/include/asm/mman.h | 14 +---- arch/x86/include/asm/module.h | 13 +---- arch/x86/include/asm/scatterlist.h | 27 +-------- arch/x86/include/asm/types.h | 12 +--- arch/x86/include/asm/ucontext.h | 8 +-- 10 files changed, 73 insertions(+), 179 deletions(-) diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h index 751af2550ed9..7cab4de2bca6 100644 --- a/arch/x86/include/asm/generic-mman.h +++ b/arch/x86/include/asm/generic-mman.h @@ -1,10 +1,8 @@ -#ifndef _ASM_X86_MMAN_H -#define _ASM_X86_MMAN_H +#ifndef __ASM_GENERIC_MMAN_H +#define __ASM_GENERIC_MMAN_H #include -#define MAP_32BIT 0x40 /* only give out 32bit addresses */ - #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ @@ -17,4 +15,4 @@ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ -#endif /* _ASM_X86_MMAN_H */ +#endif /* __ASM_GENERIC_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h index 47d62743c4d5..ed5b44de4c91 100644 --- a/arch/x86/include/asm/generic-module.h +++ b/arch/x86/include/asm/generic-module.h @@ -1,80 +1,22 @@ -#ifndef _ASM_X86_MODULE_H -#define _ASM_X86_MODULE_H +#ifndef __ASM_GENERIC_MODULE_H +#define __ASM_GENERIC_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; +/* + * Many architectures just need a simple module + * loader without arch specific data. 
+ */ +struct mod_arch_specific +{ +}; -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr +#ifdef CONFIG_64BIT +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Ehdr Elf64_Ehdr #else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr #endif -#ifdef CONFIG_X86_64 -/* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M386 -#define MODULE_PROC_FAMILY "386 " -#elif defined CONFIG_M486 -#define MODULE_PROC_FAMILY "486 " -#elif defined CONFIG_M586 -#define MODULE_PROC_FAMILY "586 " -#elif defined CONFIG_M586TSC -#define MODULE_PROC_FAMILY "586TSC " -#elif defined CONFIG_M586MMX -#define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " -#elif defined CONFIG_M686 -#define MODULE_PROC_FAMILY "686 " -#elif defined CONFIG_MPENTIUMII -#define MODULE_PROC_FAMILY "PENTIUMII " -#elif defined CONFIG_MPENTIUMIII -#define MODULE_PROC_FAMILY "PENTIUMIII " -#elif defined CONFIG_MPENTIUMM -#define MODULE_PROC_FAMILY "PENTIUMM " -#elif defined CONFIG_MPENTIUM4 -#define MODULE_PROC_FAMILY "PENTIUM4 " -#elif defined CONFIG_MK6 -#define MODULE_PROC_FAMILY "K6 " -#elif defined CONFIG_MK7 -#define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN -#define MODULE_PROC_FAMILY "ELAN " -#elif defined CONFIG_MCRUSOE -#define MODULE_PROC_FAMILY "CRUSOE " -#elif defined CONFIG_MEFFICEON -#define MODULE_PROC_FAMILY "EFFICEON " -#elif defined CONFIG_MWINCHIPC6 -#define MODULE_PROC_FAMILY "WINCHIPC6 " -#elif defined CONFIG_MWINCHIP3D -#define MODULE_PROC_FAMILY "WINCHIP3D " -#elif defined CONFIG_MCYRIXIII -#define MODULE_PROC_FAMILY "CYRIXIII " -#elif defined CONFIG_MVIAC3_2 -#define MODULE_PROC_FAMILY "VIAC3-2 " -#elif defined CONFIG_MVIAC7 -#define MODULE_PROC_FAMILY "VIAC7 " -#elif defined CONFIG_MGEODEGX1 -#define MODULE_PROC_FAMILY "GEODEGX1 " -#elif defined CONFIG_MGEODE_LX -#define MODULE_PROC_FAMILY "GEODE " -#else -#error unknown processor family -#endif - -#ifdef CONFIG_X86_32 -# ifdef CONFIG_4KSTACKS -# define MODULE_STACKSIZE "4KSTACKS " -# else -# define MODULE_STACKSIZE "" -# endif -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE -#endif - -#endif /* _ASM_X86_MODULE_H */ +#endif /* __ASM_GENERIC_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h index 263d397d2eef..8b9454496a7c 100644 --- a/arch/x86/include/asm/generic-scatterlist.h +++ b/arch/x86/include/asm/generic-scatterlist.h @@ -1,7 +1,7 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H +#ifndef __ASM_GENERIC_SCATTERLIST_H +#define __ASM_GENERIC_SCATTERLIST_H -#include +#include struct scatterlist { #ifdef CONFIG_DEBUG_SG @@ -14,20 +14,30 @@ struct scatterlist { unsigned int dma_length; }; -#define ARCH_HAS_SG_CHAIN -#define ISA_DMA_THRESHOLD (0x00ffffff) - /* - * These macros should be used after a pci_map_sg call has been done + * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries pci_map_sg - * returns. + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. 
*/ #define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) +#ifndef sg_dma_len +/* + * Normally, you have an iommu on 64 bit machines, but not on 32 bit + * machines. Architectures that are different should override this. + */ +#if __BITS_PER_LONG == 64 +#define sg_dma_len(sg) ((sg)->dma_length) #else -# define sg_dma_len(sg) ((sg)->dma_length) +#define sg_dma_len(sg) ((sg)->length) +#endif /* 64 bit */ +#endif /* sg_dma_len */ + +#ifndef ISA_DMA_THRESHOLD +#define ISA_DMA_THRESHOLD (~0UL) #endif -#endif /* _ASM_X86_SCATTERLIST_H */ +#define ARCH_HAS_SG_CHAIN + +#endif /* __ASM_GENERIC_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h index 09b97745772f..fba7d33ca3f2 100644 --- a/arch/x86/include/asm/generic-types.h +++ b/arch/x86/include/asm/generic-types.h @@ -1,6 +1,9 @@ -#ifndef _ASM_X86_TYPES_H -#define _ASM_X86_TYPES_H - +#ifndef _ASM_GENERIC_TYPES_H +#define _ASM_GENERIC_TYPES_H +/* + * int-ll64 is used practically everywhere now, + * so use it as a reasonable default. + */ #include #ifndef __ASSEMBLY__ @@ -13,18 +16,27 @@ typedef unsigned short umode_t; * These aren't exported outside the kernel to avoid name space clashes */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. */ +/* + * DMA addresses may be very different from physical addresses + * and pointers. i386 and powerpc may have 64 bit DMA on 32 bit + * systems, while sparc64 uses 32 bit DMA addresses for 64 bit + * physical addresses. + * This default defines dma_addr_t to have the same size as + * phys_addr_t, which is the most common way. + * Do not define the dma64_addr_t type, which never really + * worked. + */ +#ifndef dma_addr_t +#ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 dma_addr_t; #else typedef u32 dma_addr_t; -#endif +#endif /* CONFIG_PHYS_ADDR_T_64BIT */ +#endif /* dma_addr_t */ #endif /* __ASSEMBLY__ */ + #endif /* __KERNEL__ */ -#endif /* _ASM_X86_TYPES_H */ +#endif /* _ASM_GENERIC_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h index 87324cf439d9..ad77343e8a9a 100644 --- a/arch/x86/include/asm/generic-ucontext.h +++ b/arch/x86/include/asm/generic-ucontext.h @@ -1,11 +1,5 @@ -#ifndef _ASM_X86_UCONTEXT_H -#define _ASM_X86_UCONTEXT_H - -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext).
- */ +#ifndef __ASM_GENERIC_UCONTEXT_H +#define __ASM_GENERIC_UCONTEXT_H struct ucontext { unsigned long uc_flags; @@ -15,4 +9,4 @@ struct ucontext { sigset_t uc_sigmask; /* mask last for extensibility */ }; -#endif /* _ASM_X86_UCONTEXT_H */ +#endif /* __ASM_GENERIC_UCONTEXT_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed9..063d8c9e4d60 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include - #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#include #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d5..4a7a192910d0 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,18 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; - -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr -#else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr -#endif +#include #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2eef..2097d686471d 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,33 +1,8 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -#define ARCH_HAS_SG_CHAIN #define ISA_DMA_THRESHOLD (0x00ffffff) -/* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. 
- */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) -#else -# define sg_dma_len(sg) ((sg)->dma_length) -#endif +#include #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772f..f2fe528e9682 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,19 +1,11 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#include +#define dma_addr_t dma_addr_t -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; +#include -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d9..7cfc436f86da 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,12 +7,6 @@ * sigcontext struct (uc_mcontext). */ -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; +#include #endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3 From 69d5ffdaad7b77b97229b55c36afb20e5bebd29e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:19 +0200 Subject: x86: convert termios.h to the asm-generic version This patch turned out more controversial than expected and may get dropped in the future. I'm including it for reference anyway. The user_termio_to_kernel_termios and kernel_termios_to_user_termio functions on x86 are lacking error checking from get_user and are not portable to big-endian systems, so the asm-generic header has to differ in this regard. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/termios.h | 86 +++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index c4ee8056baca..d0922adc56d4 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,5 +1,12 @@ -#ifndef _ASM_X86_TERMIOS_H -#define _ASM_X86_TERMIOS_H +#ifndef _ASM_GENERIC_TERMIOS_H +#define _ASM_GENERIC_TERMIOS_H +/* + * Most architectures have straight copies of the x86 code, with + * varying levels of bug fixes on top. Usually it's a good idea + * to use this generic version instead, but be careful to avoid + * ABI changes. + * New architectures should not provide their own version. + */ #include #include @@ -54,37 +61,57 @@ struct termio { /* * Translate a "termio" structure into a "termios". Ugh. 
*/ -#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ - unsigned short __tmp; \ - get_user(__tmp,&(termio)->x); \ - *(unsigned short *) &(termios)->x = __tmp; \ -} - static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) + const struct termio __user *termio) { - SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); - get_user(termios->c_line, &termio->c_line); - return copy_from_user(termios->c_cc, termio->c_cc, NCC); + unsigned short tmp; + + if (get_user(tmp, &termio->c_iflag) < 0) + goto fault; + termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; + + if (get_user(tmp, &termio->c_oflag) < 0) + goto fault; + termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; + + if (get_user(tmp, &termio->c_cflag) < 0) + goto fault; + termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; + + if (get_user(tmp, &termio->c_lflag) < 0) + goto fault; + termios->c_lflag = (0xffff0000 & termios->c_lflag) | tmp; + + if (get_user(termios->c_line, &termio->c_line) < 0) + goto fault; + + if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) + goto fault; + + return 0; + + fault: + return -EFAULT; } /* * Translate a "termios" structure into a "termio". Ugh. */ static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) + struct ktermios *termios) { - put_user((termios)->c_iflag, &(termio)->c_iflag); - put_user((termios)->c_oflag, &(termio)->c_oflag); - put_user((termios)->c_cflag, &(termio)->c_cflag); - put_user((termios)->c_lflag, &(termio)->c_lflag); - put_user((termios)->c_line, &(termio)->c_line); - return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); + if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || + put_user(termios->c_oflag, &termio->c_oflag) < 0 || + put_user(termios->c_cflag, &termio->c_cflag) < 0 || + put_user(termios->c_lflag, &termio->c_lflag) < 0 || + put_user(termios->c_line, &termio->c_line) < 0 || + copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) + return -EFAULT; + + return 0; } +#ifdef TCGETS2 static inline int user_termios_to_kernel_termios(struct ktermios *k, struct termios2 __user *u) { @@ -108,7 +135,20 @@ static inline int kernel_termios_to_user_termios_1(struct termios __user *u, { return copy_to_user(u, k, sizeof(struct termios)); } +#else /* TCGETS2 */ +static inline int user_termios_to_kernel_termios(struct ktermios *k, + struct termios __user *u) +{ + return copy_from_user(k, u, sizeof(struct termios)); +} + +static inline int kernel_termios_to_user_termios(struct termios __user *u, + struct ktermios *k) +{ + return copy_to_user(u, k, sizeof(struct termios)); +} +#endif /* TCGETS2 */ #endif /* __KERNEL__ */ -#endif /* _ASM_X86_TERMIOS_H */ +#endif /* _ASM_GENERIC_TERMIOS_H */ -- cgit v1.2.3 From 73a2d096fdf23aa841f7595d114a11ec85a85e4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:20 +0200 Subject: x86: remove all now-duplicate header files All files that have been made identical to the asm-generic version in the previous patches can now be removed, guaranteeing that this does not introduce semantic changes. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/generic-mman.h | 18 --- arch/x86/include/asm/generic-module.h | 22 ---- arch/x86/include/asm/generic-scatterlist.h | 43 ------- arch/x86/include/asm/generic-types.h | 42 ------ arch/x86/include/asm/generic-ucontext.h | 12 -- arch/x86/include/asm/ioctls.h | 111 +--------------- arch/x86/include/asm/ipcbuf.h | 34 +---- arch/x86/include/asm/mman.h | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/include/asm/msgbuf.h | 48 +------ arch/x86/include/asm/param.h | 25 +--- arch/x86/include/asm/scatterlist.h | 2 +- arch/x86/include/asm/shmbuf.h | 60 +-------- arch/x86/include/asm/socket.h | 61 +-------- arch/x86/include/asm/sockios.h | 14 +- arch/x86/include/asm/termbits.h | 199 +---------------------------- arch/x86/include/asm/termios.h | 155 +--------------------- arch/x86/include/asm/types.h | 2 +- arch/x86/include/asm/ucontext.h | 2 +- 19 files changed, 14 insertions(+), 840 deletions(-) delete mode 100644 arch/x86/include/asm/generic-mman.h delete mode 100644 arch/x86/include/asm/generic-module.h delete mode 100644 arch/x86/include/asm/generic-scatterlist.h delete mode 100644 arch/x86/include/asm/generic-types.h delete mode 100644 arch/x86/include/asm/generic-ucontext.h diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h deleted file mode 100644 index 7cab4de2bca6..000000000000 --- a/arch/x86/include/asm/generic-mman.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __ASM_GENERIC_MMAN_H -#define __ASM_GENERIC_MMAN_H - -#include - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __ASM_GENERIC_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h deleted file mode 100644 index ed5b44de4c91..000000000000 --- a/arch/x86/include/asm/generic-module.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __ASM_GENERIC_MODULE_H -#define __ASM_GENERIC_MODULE_H - -/* - * Many architectures just need a simple module - * loader without arch specific data. - */ -struct mod_arch_specific -{ -}; - -#ifdef CONFIG_64BIT -#define Elf_Shdr Elf64_Shdr -#define Elf_Sym Elf64_Sym -#define Elf_Ehdr Elf64_Ehdr -#else -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr -#endif - -#endif /* __ASM_GENERIC_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h deleted file mode 100644 index 8b9454496a7c..000000000000 --- a/arch/x86/include/asm/generic-scatterlist.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef __ASM_GENERIC_SCATTERLIST_H -#define __ASM_GENERIC_SCATTERLIST_H - -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -/* - * These macros should be used after a dma_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. 
- * You should only work with the number of sg entries pci_map_sg - * returns, or alternatively stop on the first sg_dma_len(sg) which - * is 0. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifndef sg_dma_len -/* - * Normally, you have an iommu on 64 bit machines, but not on 32 bit - * machines. Architectures that are different should override this. - */ -#if __BITS_PER_LONG == 64 -#define sg_dma_len(sg) ((sg)->dma_length) -#else -#define sg_dma_len(sg) ((sg)->length) -#endif /* 64 bit */ -#endif /* sg_dma_len */ - -#ifndef ISA_DMA_THRESHOLD -#define ISA_DMA_THRESHOLD (~0UL) -#endif - -#define ARCH_HAS_SG_CHAIN - -#endif /* __ASM_GENERIC_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h deleted file mode 100644 index fba7d33ca3f2..000000000000 --- a/arch/x86/include/asm/generic-types.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _ASM_GENERIC_TYPES_H -#define _ASM_GENERIC_TYPES_H -/* - * int-ll64 is used practically everywhere now, - * so use it as a reasonable default. - */ -#include - -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; - -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ -/* - * DMA addresses may be very different from physical addresses - * and pointers. i386 and powerpc may have 64 bit DMA on 32 bit - * systems, while sparc64 uses 32 bit DMA addresses for 64 bit - * physical addresses. - * This default defines dma_addr_t to have the same size as - * phys_addr_t, which is the most common way. - * Do not define the dma64_addr_t type, which never really - * worked. - */ -#ifndef dma_addr_t -#ifdef CONFIG_PHYS_ADDR_T_64BIT -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif /* CONFIG_PHYS_ADDR_T_64BIT */ -#endif /* dma_addr_t */ - -#endif /* __ASSEMBLY__ */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_GENERIC_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h deleted file mode 100644 index ad77343e8a9a..000000000000 --- a/arch/x86/include/asm/generic-ucontext.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __ASM_GENERIC_UCONTEXT_H -#define __ASM_GENERIC_UCONTEXT_H - -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -#endif /* __ASM_GENERIC_UCONTEXT_H */ diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index a799e20a769e..ec34c760665e 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,110 +1 @@ -#ifndef __ASM_GENERIC_IOCTLS_H -#define __ASM_GENERIC_IOCTLS_H - -#include - -/* - * These are the most common definitions for tty ioctl numbers. - * Most of them do not use the recommended _IOC(), but there is - * probably some source code out there hardcoding the number, - * so we might as well use them for all new platforms. - * - * The architectures that use different values here typically - * try to be compatible with some Unix variants for the same - * architecture.
- */ - -/* 0x54 is just a magic number to make these relatively unique ('T') */ - -#define TCGETS 0x5401 -#define TCSETS 0x5402 -#define TCSETSW 0x5403 -#define TCSETSF 0x5404 -#define TCGETA 0x5405 -#define TCSETA 0x5406 -#define TCSETAW 0x5407 -#define TCSETAF 0x5408 -#define TCSBRK 0x5409 -#define TCXONC 0x540A -#define TCFLSH 0x540B -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E -#define TIOCGPGRP 0x540F -#define TIOCSPGRP 0x5410 -#define TIOCOUTQ 0x5411 -#define TIOCSTI 0x5412 -#define TIOCGWINSZ 0x5413 -#define TIOCSWINSZ 0x5414 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define FIONREAD 0x541B -#define TIOCINQ FIONREAD -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -#define FIONBIO 0x5421 -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TCGETS2 _IOR('T', 0x2A, struct termios2) -#define TCSETS2 _IOW('T', 0x2B, struct termios2) -#define TCSETSW2 _IOW('T', 0x2C, struct termios2) -#define TCSETSF2 _IOW('T', 0x2D, struct termios2) -#define TIOCGRS485 0x542E -#define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ -#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ -#define TCSETX 0x5433 -#define TCSETXF 0x5434 -#define TCSETXW 0x5435 - -#define FIONCLEX 0x5450 -#define FIOCLEX 0x5451 -#define FIOASYNC 0x5452 -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ - -/* - * some architectures define FIOQSIZE as 0x545E, which is used for - * TIOCGHAYESESP on others - */ -#ifndef FIOQSIZE -# define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -# define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -# define FIOQSIZE 0x5460 -#endif - -/* Used for packet mode */ -#define TIOCPKT_DATA 0 -#define TIOCPKT_FLUSHREAD 1 -#define TIOCPKT_FLUSHWRITE 2 -#define TIOCPKT_STOP 4 -#define TIOCPKT_START 8 -#define TIOCPKT_NOSTOP 16 -#define TIOCPKT_DOSTOP 32 - -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -#endif /* __ASM_GENERIC_IOCTLS_H */ +#include diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index 888ca1c8f951..84c7e51cb6d0 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,33 +1 @@ -#ifndef __ASM_GENERIC_IPCBUF_H -#define __ASM_GENERIC_IPCBUF_H - -/* - * The generic ipc64_perm structure: - * Note extra padding because this structure is passed back and forth - * between kernel and user space. 
- * - * ipc64_perm was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. - * - * Pad space is left for: - * - 32-bit mode_t on architectures that only had 16 bit - * - 32-bit seq - * - 2 miscellaneous 32-bit values - */ - -struct ipc64_perm { - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - __kernel_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* __ASM_GENERIC_IPCBUF_H */ +#include diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 063d8c9e4d60..593e51d4643f 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -3,6 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#include +#include #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 4a7a192910d0..555bc12bdcd6 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -#include +#include #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index aec850d9159e..809134c644a6 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,47 +1 @@ -#ifndef __ASM_GENERIC_MSGBUF_H -#define __ASM_GENERIC_MSGBUF_H - -#include -/* - * generic msqid64_ds structure. - * - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * msqid64_ds was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. - * - * 64 bit architectures typically define a 64 bit __kernel_time_t, - * so they do not need the first three padding words. - * On big-endian systems, the padding is in the wrong place. 
- * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - */ - -struct msqid64_ds { - struct ipc64_perm msg_perm; - __kernel_time_t msg_stime; /* last msgsnd time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif - __kernel_time_t msg_rtime; /* last msgrcv time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif - __kernel_time_t msg_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; -#endif - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused4; - unsigned long __unused5; -}; - -#endif /* __ASM_GENERIC_MSGBUF_H */ +#include diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index cdf8251bfb6c..965d45427975 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,24 +1 @@ -#ifndef __ASM_GENERIC_PARAM_H -#define __ASM_GENERIC_PARAM_H - -#ifdef __KERNEL__ -# define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ -# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#ifndef EXEC_PAGESIZE -#define EXEC_PAGESIZE 4096 -#endif - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif /* __ASM_GENERIC_PARAM_H */ +#include diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 2097d686471d..75af592677ec 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -3,6 +3,6 @@ #define ISA_DMA_THRESHOLD (0x00ffffff) -#include +#include #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index 5768fa60ac82..83c05fc2de38 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,59 +1 @@ -#ifndef __ASM_GENERIC_SHMBUF_H -#define __ASM_GENERIC_SHMBUF_H - -#include - -/* - * The shmid64_ds structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * shmid64_ds was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. - * - * 64 bit architectures typically define a 64 bit __kernel_time_t, - * so they do not need the first two padding words. - * On big-endian systems, the padding is in the wrong place. - * - * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif - __kernel_time_t shm_dtime; /* last detach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif - __kernel_time_t shm_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; -#endif - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. 
of current attaches */ - unsigned long __unused4; - unsigned long __unused5; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* __ASM_GENERIC_SHMBUF_H */ +#include diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index d4ae42a06a20..6b71384b9d8b 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,60 +1 @@ -#ifndef __ASM_GENERIC_SOCKET_H -#define __ASM_GENERIC_SOCKET_H - -#include - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ -#define SO_PASSCRED 16 -#define SO_PEERCRED 17 -#define SO_RCVLOWAT 18 -#define SO_SNDLOWAT 19 -#define SO_RCVTIMEO 20 -#define SO_SNDTIMEO 21 - -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#endif /* __ASM_GENERIC_SOCKET_H */ +#include diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 9a61a369b901..def6d4746ee7 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,13 +1 @@ -#ifndef __ASM_GENERIC_SOCKIOS_H -#define __ASM_GENERIC_SOCKIOS_H - -/* Socket-level I/O control calls. 
*/ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* __ASM_GENERIC_SOCKIOS_H */ +#include diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index 1c9773d48cb0..3935b106de79 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,198 +1 @@ -#ifndef __ASM_GENERIC_TERMBITS_H -#define __ASM_GENERIC_TERMBITS_H - -#include - -typedef unsigned char cc_t; -typedef unsigned int speed_t; -typedef unsigned int tcflag_t; - -#define NCCS 19 -struct termios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ -}; - -struct termios2 { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -struct ktermios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -/* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - -/* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 - -/* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 - -/* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 -#define EXTA B19200 -#define 
EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - -/* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - -/* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 - -#endif /* __ASM_GENERIC_TERMBITS_H */ +#include diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index d0922adc56d4..280d78a9d966 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,154 +1 @@ -#ifndef _ASM_GENERIC_TERMIOS_H -#define _ASM_GENERIC_TERMIOS_H -/* - * Most architectures have straight copies of the x86 code, with - * varying levels of bug fixes on top. Usually it's a good idea - * to use this generic version instead, but be careful to avoid - * ABI changes. - * New architectures should not provide their own version. - */ - -#include -#include - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - -#ifdef __KERNEL__ - -#include - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -/* - * Translate a "termio" structure into a "termios". Ugh. 
- */ -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - const struct termio __user *termio) -{ - unsigned short tmp; - - if (get_user(tmp, &termio->c_iflag) < 0) - goto fault; - termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; - - if (get_user(tmp, &termio->c_oflag) < 0) - goto fault; - termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; - - if (get_user(tmp, &termio->c_cflag) < 0) - goto fault; - termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; - - if (get_user(tmp, &termio->c_lflag) < 0) - goto fault; - termios->c_lflag = (0xffff0000 & termios->c_lflag) | tmp; - - if (get_user(termios->c_line, &termio->c_line) < 0) - goto fault; - - if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) - goto fault; - - return 0; - - fault: - return -EFAULT; -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || - put_user(termios->c_oflag, &termio->c_oflag) < 0 || - put_user(termios->c_cflag, &termio->c_cflag) < 0 || - put_user(termios->c_lflag, &termio->c_lflag) < 0 || - put_user(termios->c_line, &termio->c_line) < 0 || - copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) - return -EFAULT; - - return 0; -} - -#ifdef TCGETS2 -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios2 __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios2)); -} - -static inline int kernel_termios_to_user_termios(struct termios2 __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios2)); -} - -static inline int user_termios_to_kernel_termios_1(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios_1(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} -#else /* TCGETS2 */ -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} -#endif /* TCGETS2 */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_GENERIC_TERMIOS_H */ +#include diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index f2fe528e9682..df1da20f4534 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -3,7 +3,7 @@ #define dma_addr_t dma_addr_t -#include +#include #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 7cfc436f86da..b7c29c8017f2 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,6 +7,6 @@ * sigcontext struct (uc_mcontext). 
*/ -#include +#include #endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3 From feaa0457ec8351cae855edc9a3052ac49322538e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 20 Jun 2009 16:15:40 +0530 Subject: x86: ds.c fix invalid assignment Fixes the type mixups that cause the following sparse warnings: CHECK arch/x86/kernel/ds.c arch/x86/kernel/ds.c:549:19: warning: incorrect type in argument 2 (invalid types) arch/x86/kernel/ds.c:549:19: expected bad type enum bts_field field arch/x86/kernel/ds.c:549:19: got int arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:520:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:520:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:520:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:520:7: error: invalid assignment arch/x86/kernel/ds.c:520:35: error: incompatible types for operation (*) Signed-off-by: Jaswinder Singh Rajput Cc: Markus Metzger LKML-Reference: <1245494740.8613.12.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe1386038..ef42a038f1a6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum 
bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } -- cgit v1.2.3 From e487683990972bf9aa4e688434c46ead76748bca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:27:16 -0700 Subject: x86, mce: fix typo in comment in asm/mce.h Fix comment to match the actual declaration. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec9..b50b9e9042c4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ -- cgit v1.2.3 From a95436e44a76a32dcbe7c8df59701ddde53017c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:28:22 -0700 Subject: x86, mce: use atomic_inc_return() instead of add by 1 Use atomic_inc_return() instead of atomic_add_return() by 1. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968bc..7da8fec9ca88 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. -- cgit v1.2.3 From c17ef45342cc033fdf7bdd5b28615e0090f8d2e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:12:47 -0700 Subject: rcu: Remove Classic RCU Remove Classic RCU, given that the combination of Tree RCU and the proposed Bloatwatch RCU do everything that Classic RCU can with fewer bugs. Tree RCU has been default in x86 builds for almost six months, and seems to be quite reliable, so there does not seem to be much justification for keeping the Classic RCU code and config complexity around anymore. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 178 ---------- include/linux/rcupdate.h | 4 +- init/Kconfig | 12 +- kernel/Makefile | 1 - kernel/rcuclassic.c | 807 --------------------------------------------- 5 files changed, 3 insertions(+), 999 deletions(-) delete mode 100644 include/linux/rcuclassic.h delete mode 100644 kernel/rcuclassic.c diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h deleted file mode 100644 index bfd92e1e5d2c..000000000000 --- a/include/linux/rcuclassic.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (classic version) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Author: Dipankar Sarma - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ - -#ifndef __LINUX_RCUCLASSIC_H -#define __LINUX_RCUCLASSIC_H - -#include -#include -#include -#include -#include - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - long pending; /* Number of the last pending batch */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - unsigned long gp_start; /* Time at which GP started in jiffies. */ - unsigned long jiffies_stall; - /* Time at which to check for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */ - /* current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* Per-CPU data for Read-Copy UPdate. */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - /* - * if nxtlist is not NULL, then: - * batch: - * The batch # for the last entry of nxtlist - * [*nxttail[1], NULL = *nxttail[2]): - * Entries that batch # <= batch - * [*nxttail[0], *nxttail[1]): - * Entries that batch # <= batch - 1 - * [nxtlist, *nxttail[0]): - * Entries that batch # <= batch - 2 - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - */ - long batch; - struct rcu_head *nxtlist; - struct rcu_head **nxttail[3]; - long qlen; /* # of queued callbacks */ - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -extern void rcu_qsctr_inc(int cpu); -extern void rcu_bh_qsctr_inc(int cpu); - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) -#else -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -#endif - -#define __rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while (0) -#define __rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while (0) -#define __rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while (0) - -#define __synchronize_sched() synchronize_rcu() - -#define call_rcu_sched(head, func) call_rcu(head, func) - -extern void __rcu_init(void); -#define rcu_init_sched() do { } while (0) -extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); - -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); - -#define rcu_enter_nohz() do { } while (0) -#define rcu_exit_nohz() do { } while (0) - -/* A context switch is a grace period for rcuclassic. */ -static inline int rcu_blocking_is_gp(void) -{ - return num_online_cpus() == 1; -} - -#endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 15fbb3ca634d..0cdfdb622faa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -54,9 +54,7 @@ struct rcu_head { /* Internal to kernel, but needed by rcupreempt.h. */ extern int rcu_scheduler_active; -#if defined(CONFIG_CLASSIC_RCU) -#include -#elif defined(CONFIG_TREE_RCU) +#if defined(CONFIG_TREE_RCU) #include #elif defined(CONFIG_PREEMPT_RCU) #include diff --git a/init/Kconfig b/init/Kconfig index 1ce05a4cb5f6..d10f31dfa0b2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -316,21 +316,13 @@ choice prompt "RCU Implementation" default TREE_RCU -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. 
- config TREE_RCU bool "Tree-based hierarchical RCU" help This option selects the RCU implementation that is designed for very large SMP system with hundreds or - thousands of CPUs. + thousands of CPUs. It also scales down nicely to + smaller systems. config PREEMPT_RCU bool "Preemptible RCU" diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec97..d6042bcc9e99 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,7 +80,6 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c deleted file mode 100644 index 0f2b0b311304..000000000000 --- a/kernel/rcuclassic.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .pending = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_BITS_NONE, -}; - -static DEFINE_PER_CPU(struct rcu_data, rcu_data); -static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. 
- */ -void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} - -void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - unsigned long flags; - - set_need_resched(); - spin_lock_irqsave(&rcp->lock, flags); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - * - * cpu_online_mask is updated by the _cpu_down() - * using __stop_machine(). Since we're in irqs disabled - * section, __stop_machine() is not exectuting, hence - * the cpu_online_mask is stable. - * - * However, a cpu might have been offlined _just_ before - * we disabled irqs while entering here. - * And rcu subsystem might not yet have handled the CPU_DEAD - * notification, leading to the offlined cpu's bit - * being set in the rcp->cpumask. - * - * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent - * sending smp_reschedule() to an offlined CPU. - */ - for_each_cpu_and(cpu, - to_cpumask(rcp->cpumask), cpu_online_mask) { - if (cpu != rdp->cpu) - smp_send_reschedule(cpu); - } - } - spin_unlock_irqrestore(&rcp->lock, flags); -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - set_need_resched(); -} -#endif - -static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - long batch; - - head->next = NULL; - smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ - - /* - * Determine the batch number of this callback. - * - * Using ACCESS_ONCE to avoid the following error when gcc eliminates - * local variable "batch" and emits codes like this: - * 1) rdp->batch = rcp->cur + 1 # gets old value - * ...... - * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value - * then [*nxttail[0], *nxttail[1]) may contain callbacks - * that batch# = rdp->batch, see the comment of struct rcu_data. - */ - batch = ACCESS_ONCE(rcp->cur) + 1; - - if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { - /* process callbacks */ - rdp->nxttail[0] = rdp->nxttail[1]; - rdp->nxttail[1] = rdp->nxttail[2]; - if (rcu_batch_after(batch - 1, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[2]; - } - - rdp->batch = batch; - *rdp->nxttail[2] = head; - rdp->nxttail[2] = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = jiffies - rcp->jiffies_stall; - if (delta < 2 || rcp->cur != rcp->completed) { - spin_unlock_irqrestore(&rcp->lock, flags); - return; - } - rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... 
*/ - - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) - printk(" %d", cpu); - } - printk(" (detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rcp->gp_start)); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", - smp_processor_id(), jiffies, - jiffies - rcp->gp_start); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(jiffies - rcp->jiffies_stall) >= 0) - rcp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rcp->lock, flags); - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - long delta; - - delta = jiffies - rcp->jiffies_stall; - if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && - delta >= 0) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rcp); - - } else if (rcp->cur != rcp->completed && delta >= 2) { - - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -/** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - head->func = func; - local_irq_save(flags); - __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Return the number of RCU batches processed thus far. 
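 *
 * A minimal, hypothetical sketch of the call_rcu() pattern documented
 * above; struct foo, foo_free_rcu() and foo_replace() are illustrative
 * names, not existing kernel symbols, and the caller of foo_replace()
 * is assumed to hold the update-side lock:
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_free_rcu(struct rcu_head *head)
 *	{
 *		kfree(container_of(head, struct foo, rcu));
 *	}
 *
 *	static void foo_replace(struct foo **slot, struct foo *new_fp)
 *	{
 *		struct foo *old_fp = *slot;
 *
 *		rcu_assign_pointer(*slot, new_fp);
 *		call_rcu(&old_fp->rcu, foo_free_rcu);
 *	}
 *
 * Readers that dereferenced the old pointer under rcu_read_lock()
 * keep a valid structure until their critical sections complete.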
Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_save(flags); - rdp->qlen -= count; - local_irq_restore(flags); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. - * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ - -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->cur != rcp->pending && - rcp->completed == rcp->cur) { - rcp->cur++; - record_gp_stall_check_time(rcp); - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpumask_andnot(to_cpumask(rcp->cpumask), - cpu_online_mask, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); - if (cpumask_empty(to_cpumask(rcp->cpumask))) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. 
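 *
 * A rough timeline of one classic-RCU grace period, sketching how the
 * pieces above fit together (a sketch, not authoritative documentation):
 *
 *	rcu_start_batch():	rcp->cur++, cpumask = online CPUs
 *	per-CPU softirq:	sees rdp->quiescbatch != rcp->cur, so it
 *				sets qs_pending and clears passed_quiesc
 *	context switch:		rcu_qsctr_inc() sets passed_quiesc
 *	next softirq:		cpu_quiet() clears this CPU's cpumask bit
 *	last bit cleared:	rcp->completed = rcp->cur, grace period ends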
- */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock_irqsave(&rcp->lock, flags); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock_irqrestore(&rcp->lock, flags); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. - */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail, long batch) -{ - unsigned long flags; - - if (list) { - local_irq_save(flags); - this_rdp->batch = batch; - *this_rdp->nxttail[2] = list; - this_rdp->nxttail[2] = tail; - local_irq_restore(flags); - } -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - unsigned long flags; - - /* - * if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_irqsave(&rcp->lock, flags); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock(&rcp->lock); - - this_rdp->qlen += rdp->qlen; - local_irq_restore(flags); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. 
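 *
 * For reference, the per-CPU callback list that the function below
 * shuffles, per the struct rcu_data comment (a sketch; b == rdp->batch):
 *
 *	nxtlist --[batch <= b-2]--[batch <= b-1]--[batch <= b]--> NULL
 *	         ^               ^                ^              ^
 *	      nxtlist       *nxttail[0]     *nxttail[1]     *nxttail[2]
 *
 * Entries in the first segment have completed their grace period and
 * may be moved to donelist; the other two segments are still waiting.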
- */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - long completed_snap; - - if (rdp->nxtlist) { - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * move the other grace-period-completed entries to - * [rdp->nxtlist, *rdp->nxttail[0]) temporarily - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) - rdp->nxttail[0] = rdp->nxttail[1]; - - /* - * the grace period for entries in - * [rdp->nxtlist, *rdp->nxttail[0]) has completed and - * move these entries to donelist - */ - if (rdp->nxttail[0] != &rdp->nxtlist) { - *rdp->donetail = rdp->nxtlist; - rdp->donetail = rdp->nxttail[0]; - rdp->nxtlist = *rdp->nxttail[0]; - *rdp->donetail = NULL; - - if (rdp->nxttail[1] == rdp->nxttail[0]) - rdp->nxttail[1] = &rdp->nxtlist; - if (rdp->nxttail[2] == rdp->nxttail[0]) - rdp->nxttail[2] = &rdp->nxtlist; - rdp->nxttail[0] = &rdp->nxtlist; - } - - local_irq_restore(flags); - - if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags2; - - /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags2); - if (rcu_batch_after(rdp->batch, rcp->pending)) { - rcp->pending = rdp->batch; - rcu_start_batch(rcp); - } - spin_unlock_irqrestore(&rcp->lock, flags2); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be see before any RCU - * grace-period manupulations below. - */ - - smp_mb(); /* See above block comment. */ - - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); - - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be see after any RCU - * grace-period manupulations above. - */ - - smp_mb(); /* See above block comment. */ -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp); - - if (rdp->nxtlist) { - long completed_snap = ACCESS_ONCE(rcp->completed); - - /* - * This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (!rcu_batch_before(completed_snap, rdp->batch)) - return 1; - if (!rcu_batch_before(completed_snap, rdp->batch - 1) && - rdp->nxttail[0] != rdp->nxttail[1]) - return 1; - if (rdp->nxttail[0] != &rdp->nxtlist) - return 1; - - /* - * This cpu has pending rcu entries and the new batch - * for then hasn't been started nor scheduled start - */ - if (rcu_batch_after(rdp->batch, rcp->pending)) - return 1; - } - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. 
- */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); -} - -/* - * Top-level function driving RCU grace-period detection, normally - * invoked from the scheduler-clock interrupt. This function simply - * increments counters that are read only from softirq by this same - * CPU, so there are no memory barriers required. - */ -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so count it. - * - * Also do a memory barrier. This is needed to handle - * the case where writes from a preempt-disable section - * of code get reordered into schedule() by this CPU's - * write buffer. The memory barrier makes sure that - * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see - * by other CPUs to happen after any such write. - */ - - smp_mb(); /* See above block comment. */ - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so count it. The memory barrier - * is needed for the same reason as is the above one. - */ - - smp_mb(); /* See above block comment. */ - rcu_bh_qsctr_inc(cpu); - } - raise_rcu_softirq(); -} - -static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - unsigned long flags; - - spin_lock_irqsave(&rcp->lock, flags); - memset(rdp, 0, sizeof(*rdp)); - rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. 
- * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ -void __init __rcu_init(void) -{ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -- cgit v1.2.3 From 2495fbf7effa6868f5d74124ae9b22a57980755b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 26 Jun 2009 10:53:57 -0700 Subject: x86, setup: remove obsolete pre-Kconfig CONFIG_VIDEO_ variables There were a set of pre-Kconfig configuration variables defined in the video code. There is absolutely no evidence that they have been tweaked by anybody in modern history, so just get rid of them and hope nobody notices. If someone does complain, these should be made real Kconfig variables. Reported-by: Robert P. J. Day Signed-off-by: H. Peter Anvin --- arch/x86/boot/video-vesa.c | 7 +------ arch/x86/boot/video-vga.c | 10 ---------- arch/x86/boot/video.c | 5 ----- arch/x86/boot/video.h | 20 ++------------------ 4 files changed, 3 insertions(+), 39 deletions(-) diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147d6ffb..d7ef26ba4540 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { -#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; @@ -49,8 +48,7 @@ static int vesa_probe(void) vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ -#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ -#ifdef CONFIG_VIDEO_VESA + set_fs(vginfo.video_mode_ptr.seg); mode_ptr = vginfo.video_mode_ptr.off; @@ -102,9 +100,6 @@ static int vesa_probe(void) } return nmodes; -#else - return 0; -#endif /* CONFIG_VIDEO_VESA */ } static int vesa_set_mode(struct mode_info *mode) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 8f8d827e254d..819caa1f2008 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void) initregs(&ireg); -#ifdef CONFIG_VIDEO_400_HACK - if (adapter >= ADAPTER_VGA) { - ireg.ax = 0x1202; - ireg.bx = 0x0030; - intcall(0x10, &ireg, NULL); - } -#endif - ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; @@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void) set_fs(0); rows = rdfs8(0x484); /* rows minus one */ -#ifndef CONFIG_VIDEO_400_HACK if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; -#endif if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index bad728b76fc2..d42da3802499 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -221,7 +221,6 @@ static unsigned int mode_menu(void) } } -#ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ static struct saved_screen { int x, y; @@ -299,10 +298,6 @@ static void restore_screen(void) ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); } -#else -#define save_screen() ((void)0) -#define restore_screen() ((void)0) -#endif void set_video(void) { diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 
5bb174a997fc..ff339c5db311 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -17,19 +17,8 @@ #include -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* This code uses an extended set of video mode numbers. These include: +/* + * This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) * EXTENDED_VGA (-2) @@ -67,13 +56,8 @@ /* The "recalculate timings" flag */ #define VIDEO_RECALC 0x8000 -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN void store_screen(void); #define DO_STORE() store_screen() -#else -#define DO_STORE() ((void)0) -#endif /* CONFIG_VIDEO_RETAIN */ /* * Mode table structures -- cgit v1.2.3 From 087975b06b00af9bf888fab6f94ae113c5cd80bd Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 27 Jun 2009 15:35:15 +0900 Subject: x86: Clean up dump_pagetable() Use pgtable access helpers for 32-bit version dump_pagetable() and get rid of __typeof__() operators. This needs to make pmd_pfn() available for 2-level pgtable. Also, remove some casts for 64-bit version dump_pagetable(). Signed-off-by: Akinobu Mita LKML-Reference: <20090627063514.GA2834@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 10 ++++----- arch/x86/mm/fault.c | 51 +++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb8..af5481e94863 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -134,6 +134,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -422,11 +427,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; -} - static inline int pud_large(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff857be..9bf7e52c2869 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -285,26 +285,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address, tsk->thread.screen_bitmap |= 1 << bit; } -static void dump_pagetable(unsigned long address) +static bool low_pfn(unsigned long pfn) { - __typeof__(pte_val(__pte(0))) page; + return pfn < max_low_pfn; +} - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } 
-#else - printk("*pde = %08lx ", page); + printk("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; #endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* * We must not directly access the pte in the highpte @@ -312,16 +311,12 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + pte = pte_offset_kernel(pmd, address); + printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: printk("\n"); } @@ -449,16 +444,12 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *pgd; + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - - pgd += pgd_index(address); if (bad_address(pgd)) goto bad; -- cgit v1.2.3 From ce0c0f9eec2f377055e8b23c6fa192202381e022 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sun, 28 Jun 2009 18:07:39 +0800 Subject: x86, pgtable.h: Clean up types Use "unsigned long" consistently, not "unsigned". Signed-off-by: Figo.zhang LKML-Reference: <1246183659.2530.4.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index af5481e94863..9de8729c1c8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -356,7 +356,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -static inline unsigned pmd_index(unsigned long address) +static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } @@ -376,7 +376,7 @@ static inline unsigned pmd_index(unsigned long address) * this function returns the index of the entry in the pte page which would * control the given virtual address */ -static inline unsigned pte_index(unsigned long address) +static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } @@ -462,7 +462,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. 
*/ -static inline unsigned pud_index(unsigned long address) +static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } -- cgit v1.2.3 From 565b0c1f100408ccbcb04ba458a14da454cb271d Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Mon, 29 Jun 2009 12:02:55 +0800 Subject: x86, highmem_32.c: Clean up comment Signed-off-by: Figo.zhang Cc: Andrew Morton LKML-Reference: <1246248175.5759.12.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e81919..0c6f43cee25d 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -24,7 +24,7 @@ void kunmap(struct page *page) * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * - * However when holding an atomic kmap is is not legal to sleep, so atomic + * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) -- cgit v1.2.3 From 54d35f29f49224d86b994acb6e5969b9ba09022d Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 29 Jun 2009 14:44:57 +0900 Subject: sched: Hide runqueues from direct reference at source code level for __raw_get_cpu_var() Hide __raw_get_cpu_var() as well - thus all the direct references to runqueues will abstracted out. Signed-off-by: Hitoshi Mitake LKML-Reference: <20090629.144457.886429910353660979.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 168b2680ae27..ebc5151f88c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -692,6 +692,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -6669,7 +6670,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); @@ -6681,7 +6682,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); -- cgit v1.2.3 From c5cb5a2d8d7dc872cf1504091ad0e59fe5ff7cb5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 30 Jun 2009 17:08:14 -0400 Subject: kprobes: Clean up insn_pages by using list instead of hlist Use struct list instead of struct hlist for managing insn_pages, because insn_pages doesn't use hash table. 
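For reference, the conversion changes the iteration idiom roughly as
follows (kip and kprobe_insn_pages as in the patch below; loop bodies
elided):

	/* hlist walk: needs a separate struct hlist_node cursor */
	struct hlist_node *pos;
	hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist)
		...;

	/* plain list walk: no cursor, and helpers such as
	 * list_is_singular() become available */
	list_for_each_entry(kip, &kprobe_insn_pages, list)
		...;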
Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli LKML-Reference: <20090630210814.17851.64651.stgit@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 16b5739c516a..6fe9dc6d1a81 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) struct kprobe_insn_page { - struct hlist_node hlist; + struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; @@ -117,7 +117,7 @@ enum kprobe_slot_state { }; static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static struct hlist_head kprobe_insn_pages; +static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -152,10 +152,9 @@ loop_end: static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; - struct hlist_node *pos; retry: - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) kfree(kip); return NULL; } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); + INIT_LIST_HEAD(&kip->list); + list_add(&kip->list, &kprobe_insn_pages); memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); kip->slot_used[0] = SLOT_USED; kip->nused = 1; @@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. 
*/ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { + if (!list_is_singular(&kprobe_insn_pages)) { + list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); } @@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) static int __kprobes collect_garbage_slots(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos, *next; + struct kprobe_insn_page *kip, *next; /* Ensure no-one is preepmted on the garbages */ if (check_safety()) return -EAGAIN; - hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { + list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; if (kip->ngarbage == 0) continue; @@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void) void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; - struct hlist_node *pos; mutex_lock(&kprobe_insn_mutex); - hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { + list_for_each_entry(kip, &kprobe_insn_pages, list) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; if (dirty) { kip->slot_used[i] = SLOT_DIRTY; kip->ngarbage++; - } else { + } else collect_one_slot(kip, i); - } break; } } -- cgit v1.2.3 From 020e5f85cb087a40572c8b8b2dd06292a14fa212 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Jul 2009 10:47:05 +0800 Subject: tracing/events: Add trace_event boot option We already have ftrace= boot option, and this adds a similar boot option for trace events, so allow trace events to be enabled at boot, for boot debugging purpose. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4ACE29.3010407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/trace/events.txt | 9 +++++++++ kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 37 +++++++++++++++++++++++++++++++++---- 5 files changed, 52 insertions(+), 6 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d3f41db3ed49..2582e7aea29f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2478,6 +2478,11 @@ and is between 256 and 4096 characters. It is defined in the file trace_buf_size=nn[KMG] [FTRACE] will set tracing buffer size. + trace_event=[event-list] + [FTRACE] Set and start specified trace events in order + to facilitate early boot debugging. + See also Documentation/trace/events.txt + trix= [HW,OSS] MediaTrix AudioTrix Pro Format: ,,,,,,,, diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d7594ea7..2bcc8d4dea29 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -83,6 +83,15 @@ When reading one of these enable files, there are four results: X - there is a mixture of events enabled and disabled ? - this file does not affect any event +2.3 Boot option +--------------- + +In order to facilitate early boot debugging, use boot option: + + trace_event=[event-list] + +The format of this boot option is the same as described in section 2.1. + 3. 
Defining an event-enabled tracepoint ======================================= diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa8..bdb3afc8b306 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,7 +49,7 @@ unsigned long __read_mostly tracing_thresh; * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -static int ring_buffer_expanded; +int ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -63,7 +63,7 @@ static bool __read_mostly tracing_selftest_running; /* * If a tracer is running, we do not want to run SELFTEST. */ -static bool __read_mostly tracing_selftest_disabled; +bool __read_mostly tracing_selftest_disabled; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc780..52eb0d8dcd75 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -517,6 +517,9 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; + #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53c8fd376a88..fecac1314cbe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -17,6 +17,8 @@ #include #include +#include + #include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" @@ -1133,6 +1135,18 @@ struct notifier_block trace_module_nb = { extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + static __init int event_trace_init(void) { struct ftrace_event_call *call; @@ -1140,6 +1154,8 @@ static __init int event_trace_init(void) struct dentry *entry; struct dentry *d_events; int ret; + char *buf = bootup_event_buf; + char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1185,6 +1201,19 @@ static __init int event_trace_init(void) &ftrace_event_format_fops); } + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); + } + ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); @@ -1392,10 +1421,10 @@ static __init void event_trace_self_test_with_function(void) static __init int event_trace_self_tests_init(void) { - - event_trace_self_tests(); - - event_trace_self_test_with_function(); + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } return 0; } -- cgit v1.2.3 From 9d22b536609abf0d64648f99518676ea58245e3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:52:30 +0530 Subject: x86: Mark ptrace_get_debugreg() as static This sparse warning: arch/x86/kernel/ptrace.c:560:15: warning: symbol 'ptrace_get_debugreg' was not declared. Should it be static? 
triggers because ptrace_get_debugreg() is global but is only used in a single .c file. change ptrace_get_debugreg() to static to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Steven Rostedt LKML-Reference: <1246458150.6940.19.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b457f78b7dbf..cabdabce3cb2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -557,7 +557,7 @@ restore: /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ -unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; -- cgit v1.2.3 From 03b042bf1dc14a268a3d65d38b4ec2a4261e8477 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:16 -0700 Subject: rcu: Add synchronize_sched_expedited() primitive This adds the synchronize_sched_expedited() primitive that implements the "big hammer" expedited RCU grace periods. This primitive is placed in kernel/sched.c rather than kernel/rcupdate.c due to its need to interact closely with the migration_thread() kthread. The idea is to wake up this kthread with req->task set to NULL, in response to which the kthread reports the quiescent state resulting from the kthread having been scheduled. Because this patch needs to fallback to the slow versions of the primitives in response to some races with CPU onlining and offlining, a new synchronize_rcu_bh() primitive is added as well. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460982947-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 25 ++++----- include/linux/rcupreempt.h | 10 ++++ include/linux/rcutree.h | 12 ++++- kernel/rcupdate.c | 25 +++++++++ kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 186 insertions(+), 15 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0cdfdb622faa..3c89d6a2591f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,7 +51,19 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -/* Internal to kernel, but needed by rcupreempt.h. 
*/ +/* Exported common interfaces */ +extern void synchronize_rcu(void); +extern void synchronize_rcu_bh(void); +extern void rcu_barrier(void); +extern void rcu_barrier_bh(void); +extern void rcu_barrier_sched(void); +extern void synchronize_sched_expedited(void); +extern int sched_expedited_torture_stats(char *page); + +/* Internal to kernel */ +extern void rcu_init(void); +extern void rcu_scheduler_starting(void); +extern int rcu_needs_cpu(int cpu); extern int rcu_scheduler_active; #if defined(CONFIG_TREE_RCU) @@ -257,15 +269,4 @@ extern void call_rcu(struct rcu_head *head, extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); -/* Exported common interfaces */ -extern void synchronize_rcu(void); -extern void rcu_barrier(void); -extern void rcu_barrier_bh(void); -extern void rcu_barrier_sched(void); - -/* Internal to kernel */ -extern void rcu_init(void); -extern void rcu_scheduler_starting(void); -extern int rcu_needs_cpu(int cpu); - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index fce522782ffa..f164ac9b7807 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -74,6 +74,16 @@ extern int rcu_needs_cpu(int cpu); extern void __synchronize_sched(void); +static inline void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); /* Placeholder for new rcupreempt implementation. */ +} + +static inline void synchronize_rcu_bh_expedited(void) +{ + synchronize_rcu_bh(); /* Placeholder for new rcupreempt impl. */ +} + extern void __rcu_init(void); extern void rcu_init_sched(void); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5a5153806c42..d4dfd2489633 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -286,8 +286,14 @@ static inline void __rcu_read_unlock_bh(void) #define call_rcu_sched(head, func) call_rcu(head, func) -static inline void rcu_init_sched(void) +static inline void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} + +static inline void synchronize_rcu_bh_expedited(void) { + synchronize_sched_expedited(); } extern void __rcu_init(void); @@ -297,6 +303,10 @@ extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); +static inline void rcu_init_sched(void) +{ +} + #ifdef CONFIG_NO_HZ void rcu_enter_nohz(void); void rcu_exit_nohz(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a967c9feb90a..eae29c25fb14 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -98,6 +98,30 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. 
*/ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type) static inline void wait_migrated_callbacks(void) { wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); + smp_mb(); /* In case we didn't sleep. */ } /* diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..9ae80bec1c1e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7024,6 +7024,11 @@ fail: return ret; } +#define RCU_MIGRATION_IDLE 0 +#define RCU_MIGRATION_NEED_QS 1 +#define RCU_MIGRATION_GOT_QS 2 +#define RCU_MIGRATION_MUST_SYNC 3 + /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto @@ -7031,6 +7036,7 @@ fail: */ static int migration_thread(void *data) { + int badcpu; int cpu = (long)data; struct rq *rq; @@ -7065,8 +7071,17 @@ static int migration_thread(void *data) req = list_entry(head->next, struct migration_req, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + if (req->task != NULL) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + } else if (likely(cpu == (badcpu = smp_processor_id()))) { + req->dest_cpu = RCU_MIGRATION_GOT_QS; + spin_unlock(&rq->lock); + } else { + req->dest_cpu = RCU_MIGRATION_MUST_SYNC; + spin_unlock(&rq->lock); + WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); + } local_irq_enable(); complete(&req->done); @@ -10554,3 +10569,113 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifndef CONFIG_SMP + +int rcu_expedited_torture_stats(char *page) +{ + return 0; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); +static DEFINE_MUTEX(rcu_sched_expedited_mutex); + +#define RCU_EXPEDITED_STATE_POST -2 +#define RCU_EXPEDITED_STATE_IDLE -1 + +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + +int rcu_expedited_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + + cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); + for_each_online_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d:%d", + cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); + +static long synchronize_sched_expedited_count; + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + */ +void synchronize_sched_expedited(void) +{ + int cpu; + unsigned long flags; + bool need_full_sync = 0; + struct rq *rq; + struct migration_req *req; + long snap; + int trycount = 0; + + smp_mb(); /* ensure prior mod happens before capturing snap. 
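 *
 * Typical caller-side use of the expedited primitive added by this
 * patch; a hypothetical sketch, where p, its ->list member and mylock
 * are illustrative names only:
 *
 *	spin_lock(&mylock);
 *	list_del_rcu(&p->list);
 *	spin_unlock(&mylock);
 *	synchronize_rcu_expedited();	(quick, but CPU-intensive)
 *	kfree(p);
 *
 * As the header comment notes, it must not be called while holding any
 * lock acquired by a CPU-hotplug notifier, on pain of deadlock.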
*/ + snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + get_online_cpus(); + while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + put_online_cpus(); + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + get_online_cpus(); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_POST; + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); + req = &per_cpu(rcu_migration_req, cpu); + init_completion(&req->done); + req->task = NULL; + req->dest_cpu = RCU_MIGRATION_NEED_QS; + spin_lock_irqsave(&rq->lock, flags); + list_add(&req->list, &rq->migration_queue); + spin_unlock_irqrestore(&rq->lock, flags); + wake_up_process(rq->migration_thread); + } + for_each_online_cpu(cpu) { + rcu_expedited_state = cpu; + req = &per_cpu(rcu_migration_req, cpu); + rq = cpu_rq(cpu); + wait_for_completion(&req->done); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) + need_full_sync = 1; + req->dest_cpu = RCU_MIGRATION_IDLE; + spin_unlock_irqrestore(&rq->lock, flags); + } + rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + mutex_unlock(&rcu_sched_expedited_mutex); + put_online_cpus(); + if (need_full_sync) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ -- cgit v1.2.3 From 0acc512cb1a29636df5e982c7d845edafe77c2d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:17 -0700 Subject: rcu: Add synchronize_sched_expedited() torture tests This patch adds rcutorture tests for the new synchronize_sched_expedited() primitive, and also does some whitespace cleanups in kernel/rcutorture.c as well. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460981342-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 202 ++++++++++++++++++++++++++++------------------------ 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9b4a975a4b4a..b33db539a8ad 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -257,14 +257,14 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); - void (*readdelay)(struct rcu_random_state *rrsp); + void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); - void (*deferredfree)(struct rcu_torture *p); + void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); int (*stats)(char *page); - int irqcapable; + int irq_capable; char *name; }; static struct rcu_torture_ops *cur_ops = NULL; @@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p) rp->rtort_mbtest = 0; rcu_torture_free(rp); } else - cur_ops->deferredfree(rp); + cur_ops->deferred_free(rp); } static void rcu_torture_deferred_free(struct rcu_torture *p) @@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = rcu_barrier, - .stats = NULL, - .irqcapable = 1, - .name = "rcu" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, + .stats = NULL, + .irq_capable = 1, + .name = "rcu" }; static void rcu_sync_torture_deferred_free(struct rcu_torture *p) @@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_torture_read_lock, - .readdelay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_sync" }; /* @@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = rcu_barrier_bh, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh" + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" }; static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = rcu_bh_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .irqcapable = 1, - .name = "rcu_bh_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_sync" }; /* @@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, - .readlock = srcu_torture_read_lock, - .readdelay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu" + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu" }; /* @@ -574,32 +574,49 @@ static void sched_torture_synchronize(void) } static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = rcu_barrier_sched, - .stats = NULL, - .irqcapable = 1, - .name = "sched" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, + .stats = NULL, + .irq_capable = 1, + .name = "sched" }; static struct rcu_torture_ops sched_ops_sync = { - .init = rcu_sync_torture_init, - .cleanup = NULL, - .readlock = sched_torture_read_lock, - .readdelay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, - .cb_barrier = NULL, - .stats = NULL, - .name = "sched_sync" + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + +extern int rcu_expedited_torture_stats(char *page); + +static struct rcu_torture_ops sched_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_sched_expedited, + .cb_barrier = NULL, + .stats = rcu_expedited_torture_stats, + .irq_capable = 1, + .name = "sched_expedited" }; /* @@ -635,7 +652,7 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferredfree(old_rp); + cur_ops->deferred_free(old_rp); } rcu_torture_current_version++; oldbatch = cur_ops->completed(); @@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused) if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); n_rcu_torture_timers++; spin_unlock(&rand_lock); preempt_disable(); @@ -738,11 +755,11 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); do { - if (irqreader && cur_ops->irqcapable) { + if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, 1); } @@ -757,7 +774,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - cur_ops->readdelay(&rand); + cur_ops->read_delay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -778,7 +795,7 @@ rcu_torture_reader(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irqcapable) + if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -1078,6 +1095,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &sched_expedited_ops, &srcu_ops, &sched_ops, &sched_ops_sync, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3 From 240ebbf81f149b11a31e060ebe5ee51a3c775360 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2009 09:08:18 -0700 Subject: rcu: Add synchronize_sched_expedited() rcutorture doc + updates This patch updates the rcutorture documentation to include updated output format. It also brings the RCU documentation up to date. Signed-off-by: Paul E. 
McKenney Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: dada1@cosmosbay.com Cc: zbr@ioremap.net Cc: jeff.chua.linux@gmail.com Cc: paulus@samba.org Cc: laijs@cn.fujitsu.com Cc: jengelh@medozas.de Cc: r000n@r000n.net Cc: benh@kernel.crashing.org Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <12459460983193-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/RTFP.txt | 77 ++++++++++++++++++++++++++++++++++++++++ Documentation/RCU/UP.txt | 34 +++++++++++++----- Documentation/RCU/checklist.txt | 20 ++++++++--- Documentation/RCU/rcubarrier.txt | 7 ++++ Documentation/RCU/torture.txt | 23 ++++++++++-- Documentation/RCU/whatisRCU.txt | 14 ++++++-- 6 files changed, 156 insertions(+), 19 deletions(-) diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 9f711d2df91b..d2b85237c76e 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -743,3 +743,80 @@ Revised: RCU, realtime RCU, sleepable RCU, performance. " } + +@article{PaulEMcKenney2008RCUOSR +,author="Paul E. McKenney and Jonathan Walpole" +,title="Introducing technology into the {Linux} kernel: a case study" +,Year="2008" +,journal="SIGOPS Oper. Syst. Rev." +,volume="42" +,number="5" +,pages="4--17" +,issn="0163-5980" +,doi={http://doi.acm.org/10.1145/1400097.1400099} +,publisher="ACM" +,address="New York, NY, USA" +,annotation={ + Linux changed RCU to a far greater degree than RCU has changed Linux. +} +} + +@unpublished{PaulEMcKenney2008HierarchicalRCU +,Author="Paul E. McKenney" +,Title="Hierarchical {RCU}" +,month="November" +,day="3" +,year="2008" +,note="Available: +\url{http://lwn.net/Articles/305782/} +[Viewed November 6, 2008]" +,annotation=" + RCU with combining-tree-based grace-period detection, + permitting it to handle thousands of CPUs. +" +} + +@conference{PaulEMcKenney2009MaliciousURCU +,Author="Paul E. McKenney" +,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms" +,Booktitle="linux.conf.au 2009" +,month="January" +,year="2009" +,address="Hobart, Australia" +,note="Available: +\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf} +[Viewed February 2, 2009]" +,annotation=" + Realtime RCU and torture-testing RCU uses. +" +} + +@unpublished{MathieuDesnoyers2009URCU +,Author="Mathieu Desnoyers" +,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}" +,month="February" +,day="5" +,year="2009" +,note="Available: +\url{http://lkml.org/lkml/2009/2/5/572} +\url{git://lttng.org/userspace-rcu.git} +[Viewed February 20, 2009]" +,annotation=" + Mathieu Desnoyers's user-space RCU implementation. + git://lttng.org/userspace-rcu.git +" +} + +@unpublished{PaulEMcKenney2009BloatWatchRCU +,Author="Paul E. McKenney" +,Title="{RCU}: The {Bloatwatch} Edition" +,month="March" +,day="17" +,year="2009" +,note="Available: +\url{http://lwn.net/Articles/323929/} +[Viewed March 20, 2009]" +,annotation=" + Uniprocessor assumptions allow simplified RCU implementation. +" +} diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt index aab4a9ec3931..90ec5341ee98 100644 --- a/Documentation/RCU/UP.txt +++ b/Documentation/RCU/UP.txt @@ -2,14 +2,13 @@ RCU on Uniprocessor Systems A common misconception is that, on UP systems, the call_rcu() primitive -may immediately invoke its function, and that the synchronize_rcu() -primitive may return immediately. The basis of this misconception +may immediately invoke its function. 
The basis of this misconception is that since there is only one CPU, it should not be necessary to wait for anything else to get done, since there are no other CPUs for anything else to be happening on. Although this approach will -sort- -of- work a surprising amount of the time, it is a very bad idea in general. -This document presents three examples that demonstrate exactly how bad an -idea this is. +This document presents three examples that demonstrate exactly how bad +an idea this is. Example 1: softirq Suicide @@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect? Summary -Permitting call_rcu() to immediately invoke its arguments or permitting -synchronize_rcu() to immediately return breaks RCU, even on a UP system. -So do not do it! Even on a UP system, the RCU infrastructure -must- -respect grace periods, and -must- invoke callbacks from a known environment -in which no locks are held. +Permitting call_rcu() to immediately invoke its arguments breaks RCU, +even on a UP system. So do not do it! Even on a UP system, the RCU +infrastructure -must- respect grace periods, and -must- invoke callbacks +from a known environment in which no locks are held. + +It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return +immediately on an UP system. It is also safe for synchronize_rcu() +to return immediately on UP systems, except when running preemptable +RCU. + +Quick Quiz #3: Why can't synchronize_rcu() return immediately on + UP systems running preemptable RCU? Answer to Quick Quiz #1: @@ -117,3 +123,13 @@ Answer to Quick Quiz #2: callbacks acquire locks directly. However, a great many RCU callbacks do acquire locks -indirectly-, for example, via the kfree() primitive. + +Answer to Quick Quiz #3: + Why can't synchronize_rcu() return immediately on UP systems + running preemptable RCU? + + Because some other task might have been preempted in the middle + of an RCU read-side critical section. If synchronize_rcu() + simply immediately returned, it would prematurely signal the + end of the grace period, which would come as a nasty shock to + that other thread when it started running again. diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index accfe2f5247d..51525a30e8b4 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome! structure is updated more than about 10% of the time, then you should strongly consider some other approach, unless detailed performance measurements show that RCU is nonetheless - the right tool for the job. + the right tool for the job. Yes, you might think of RCU + as simply cutting overhead off of the readers and imposing it + on the writers. That is exactly why normal uses of RCU will + do much more reading than updating. Another exception is where performance is not an issue, and RCU provides a simpler implementation. An example of this situation @@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome! instead need to use synchronize_irq() or synchronize_sched(). 12. Any lock acquired by an RCU callback must be acquired elsewhere - with irq disabled, e.g., via spin_lock_irqsave(). Failing to - disable irq on a given acquisition of that lock will result in - deadlock as soon as the RCU callback happens to interrupt that - acquisition's critical section. + with softirq disabled, e.g., via spin_lock_irqsave(), + spin_lock_bh(), etc. 
Failing to disable softirq on a given + acquisition of that lock will result in deadlock as soon as the + RCU callback happens to interrupt that acquisition's critical + section. 13. RCU callbacks can be and are executed in parallel. In many cases, the callback code simply wrappers around kfree(), so that this @@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome! Because these primitives only wait for pre-existing readers, it is the caller's responsibility to guarantee safety to any subsequent readers. + +16. The various RCU read-side primitives do -not- contain memory + barriers. The CPU (and in some cases, the compiler) is free + to reorder code into and out of RCU read-side critical sections. + It is the responsibility of the RCU update-side primitives to + deal with this. diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index 909602d409bb..e439a0edee22 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt @@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all the timers, and only then invoke rcu_barrier() to wait for any remaining RCU callbacks to complete. +Of course, if your module uses call_rcu_bh(), you will need to invoke +rcu_barrier_bh() before unloading. Similarly, if your module uses +call_rcu_sched(), you will need to invoke rcu_barrier_sched() before +unloading. If your module uses call_rcu(), call_rcu_bh(), -and- +call_rcu_sched(), then you will need to invoke each of rcu_barrier(), +rcu_barrier_bh(), and rcu_barrier_sched(). + Implementing rcu_barrier() diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index a342b6e1cc10..9dba3bb90e60 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, "rcu_sync" for rcu_read_lock() with synchronous reclamation, "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for rcu_read_lock_bh() with synchronous reclamation, "srcu" for - the "srcu_read_lock()" API, and "sched" for the use of - preempt_disable() together with synchronize_sched(). + the "srcu_read_lock()" API, "sched" for the use of + preempt_disable() together with synchronize_sched(), + and "sched_expedited" for the use of preempt_disable() + with synchronize_sched_expedited(). verbose Enable debug printk()s. Default is disabled. @@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The "idx" value maps the "old" and "current" values to the underlying array, and is useful for debugging. +Similarly, sched_expedited RCU provides the following: + + sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319 + sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 + sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 + sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 + state: -1 / 0:0 3:0 4:0 +As before, the first four lines are similar to those for RCU. +The last line shows the task-migration state. The first number is +-1 if synchronize_sched_expedited() is idle, -2 if in the process of +posting wakeups to the migration kthreads, and N when waiting on CPU N. +Each of the colon-separated fields following the "/" is a CPU:state pair.
+Valid states are "0" for idle, +"1" for waiting for quiescent state, +"2" for passed through quiescent state, and "3" when a race with a +CPU-hotplug event forces use of the synchronize_sched() primitive. + USAGE diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 96170824a717..97ded2432c59 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -785,6 +785,7 @@ RCU pointer/list traversal: rcu_dereference list_for_each_entry_rcu hlist_for_each_entry_rcu + hlist_nulls_for_each_entry_rcu list_for_each_continue_rcu (to be deprecated in favor of new list_for_each_entry_continue_rcu) @@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier rcu_read_lock synchronize_net rcu_barrier rcu_read_unlock synchronize_rcu + synchronize_rcu_expedited call_rcu bh: Critical sections Grace period Barrier rcu_read_lock_bh call_rcu_bh rcu_barrier_bh - rcu_read_unlock_bh + rcu_read_unlock_bh synchronize_rcu_bh + synchronize_rcu_bh_expedited sched: Critical sections Grace period Barrier - [preempt_disable] synchronize_sched rcu_barrier_sched - [and friends] call_rcu_sched + rcu_read_lock_sched synchronize_sched rcu_barrier_sched + rcu_read_unlock_sched call_rcu_sched + [preempt_disable] synchronize_sched_expedited + [and friends] SRCU: Critical sections Grace period Barrier @@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu N/A srcu_read_unlock +SRCU: Initialization/cleanup + init_srcu_struct + cleanup_srcu_struct See the comment headers in the source code (or the docbook generated from them) for more information. -- cgit v1.2.3 From 4d09161196c9a836eacea4b36e2f217bc34894cf Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Wed, 1 Jul 2009 21:08:37 -0400 Subject: printk: Enable the use of more than one CON_BOOT (early console) Today, register_console() assumes the following usage: - The first console to register with a flag set to CON_BOOT is the one and only bootconsole. - If register_console() is called with another CON_BOOT console, it is silently rejected. - As soon as a console without the CON_BOOT flag set calls register_console(), the bootconsole is automatically unregistered. - Once there is a "real" console - register_console() will silently reject any consoles with its CON_BOOT flag set. In many systems (alpha, blackfin, microblaze, mips, powerpc, sh, & x86), there are early_printk implementations which use CON_BOOT and which write to serial ports, vga, usb, & memory buffers. In many embedded systems, it would be nice to have two bootconsoles - in case the primary fails, you always have access to a backup memory buffer - but this requires at least two CON_BOOT consoles... This patch enables that functionality.
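The heart of the new policy is a scan of the registered console list: a CON_BOOT console is accepted only as long as every console already registered is itself a bootconsole. A minimal user-space sketch of that check (simplified types and hypothetical names, not the kernel's actual register_console(), which additionally handles handover and bootconsole unregistration):

#include <stdio.h>

#define CON_BOOT 0x1

struct console {
	const char *name;
	int flags;
	struct console *next;
};

static struct console *console_drivers;	/* head of the registered-console list */

/* Accept any number of bootconsoles until a real console has registered. */
static int register_console_sketch(struct console *newcon)
{
	struct console *con;

	if (newcon->flags & CON_BOOT) {
		for (con = console_drivers; con != NULL; con = con->next) {
			if (!(con->flags & CON_BOOT)) {
				printf("Too late to register bootconsole %s\n",
				       newcon->name);
				return -1;
			}
		}
	}
	newcon->next = console_drivers;
	console_drivers = newcon;
	printf("%sconsole [%s] enabled\n",
	       (newcon->flags & CON_BOOT) ? "boot" : "", newcon->name);
	return 0;
}

int main(void)
{
	struct console early1 = { "early_shadow0", CON_BOOT, NULL };
	struct console early2 = { "early_BFuart0", CON_BOOT, NULL };
	struct console real = { "ttyBF0", 0, NULL };
	struct console late = { "early_late0", CON_BOOT, NULL };

	register_console_sketch(&early1);	/* first bootconsole: accepted */
	register_console_sketch(&early2);	/* second bootconsole: also accepted now */
	register_console_sketch(&real);		/* real console registers */
	register_console_sketch(&late);		/* rejected: a real console exists */
	return 0;
}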
With the change applied, on boot you get (if you try to re-enable a boot console after the "real" console has been registered): root:/> dmesg | grep console bootconsole [early_shadow0] enabled bootconsole [early_BFuart0] enabled Kernel command line: root=/dev/mtdblock0 rw earlyprintk=serial,uart0,57600 console=ttyBF0,57600 nmi_debug=regs console handover:boot [early_BFuart0] boot [early_shadow0] -> real [ttyBF0] Too late to register bootconsole early_shadow0 or: root:/> dmesg | grep console Kernel command line: root=/dev/mtdblock0 rw console=ttyBF0,57600 console [ttyBF0] enabled Signed-off-by: Robin Getz Cc: "Linus Torvalds" Cc: "Andrew Morton" Cc: "Mike Frysinger" Cc: "Paul Mundt" LKML-Reference: <200907012108.38030.rgetz@blackfin.uclinux.org> Signed-off-by: Ingo Molnar --- kernel/printk.c | 146 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 54 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index b4d97b54c1ec..41fe60995530 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -36,6 +36,12 @@ #include +/* + * for_each_console() allows you to iterate on each console + */ +#define for_each_console(con) \ + for (con = console_drivers; con != NULL; con = con->next) + /* * Architectures can override it: */ @@ -412,7 +418,7 @@ static void __call_console_drivers(unsigned start, unsigned end) { struct console *con; - for (con = console_drivers; con; con = con->next) { + for_each_console(con) { if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -544,7 +550,7 @@ static int have_callable_console(void) { struct console *con; - for (con = console_drivers; con; con = con->next) + for_each_console(con) if (con->flags & CON_ANYTIME) return 1; @@ -1082,7 +1088,7 @@ void console_unblank(void) console_locked = 1; console_may_schedule = 0; - for (c = console_drivers; c != NULL; c = c->next) + for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); release_console_sem(); @@ -1097,7 +1103,7 @@ struct tty_driver *console_device(int *index) struct tty_driver *driver = NULL; acquire_console_sem(); - for (c = console_drivers; c != NULL; c = c->next) { + for_each_console(c) { if (!c->device) continue; driver = c->device(c, index); @@ -1134,25 +1140,49 @@ EXPORT_SYMBOL(console_start); * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. + * + * This can happen pretty early during the boot process (because of + * early_printk) - sometimes before setup_arch() completes - be careful + * of what kernel features are used - they may not be initialised yet. + * + * There are two types of consoles - bootconsoles (early_printk) and + * "real" consoles (everything which is not a bootconsole) which are + * handled differently. + * - Any number of bootconsoles can be registered at any time. + * - As soon as a "real" console is registered, all bootconsoles + * will be unregistered automatically. 
+ * - Once a "real" console is registered, any attempt to register a + * bootconsoles will be rejected */ -void register_console(struct console *console) +void register_console(struct console *newcon) { int i; unsigned long flags; - struct console *bootconsole = NULL; + struct console *bcon = NULL; - if (console_drivers) { - if (console->flags & CON_BOOT) - return; - if (console_drivers->flags & CON_BOOT) - bootconsole = console_drivers; + /* + * before we register a new CON_BOOT console, make sure we don't + * already have a valid console + */ + if (console_drivers && newcon->flags & CON_BOOT) { + /* find the last or real console */ + for_each_console(bcon) { + if (!(bcon->flags & CON_BOOT)) { + printk(KERN_INFO "Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } + } } - if (preferred_console < 0 || bootconsole || !console_drivers) + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + + if (preferred_console < 0 || bcon || !console_drivers) preferred_console = selected_console; - if (console->early_setup) - console->early_setup(); + if (newcon->early_setup) + newcon->early_setup(); /* * See if we want to use this console driver. If we @@ -1160,13 +1190,13 @@ void register_console(struct console *console) * that registers here. */ if (preferred_console < 0) { - if (console->index < 0) - console->index = 0; - if (console->setup == NULL || - console->setup(console, NULL) == 0) { - console->flags |= CON_ENABLED; - if (console->device) { - console->flags |= CON_CONSDEV; + if (newcon->index < 0) + newcon->index = 0; + if (newcon->setup == NULL || + newcon->setup(newcon, NULL) == 0) { + newcon->flags |= CON_ENABLED; + if (newcon->device) { + newcon->flags |= CON_CONSDEV; preferred_console = 0; } } @@ -1178,47 +1208,53 @@ void register_console(struct console *console) */ for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { - if (strcmp(console_cmdline[i].name, console->name) != 0) + if (strcmp(console_cmdline[i].name, newcon->name) != 0) continue; - if (console->index >= 0 && - console->index != console_cmdline[i].index) + if (newcon->index >= 0 && + newcon->index != console_cmdline[i].index) continue; - if (console->index < 0) - console->index = console_cmdline[i].index; + if (newcon->index < 0) + newcon->index = console_cmdline[i].index; #ifdef CONFIG_A11Y_BRAILLE_CONSOLE if (console_cmdline[i].brl_options) { - console->flags |= CON_BRL; - braille_register_console(console, + newcon->flags |= CON_BRL; + braille_register_console(newcon, console_cmdline[i].index, console_cmdline[i].options, console_cmdline[i].brl_options); return; } #endif - if (console->setup && - console->setup(console, console_cmdline[i].options) != 0) + if (newcon->setup && + newcon->setup(newcon, console_cmdline[i].options) != 0) break; - console->flags |= CON_ENABLED; - console->index = console_cmdline[i].index; + newcon->flags |= CON_ENABLED; + newcon->index = console_cmdline[i].index; if (i == selected_console) { - console->flags |= CON_CONSDEV; + newcon->flags |= CON_CONSDEV; preferred_console = selected_console; } break; } - if (!(console->flags & CON_ENABLED)) + if (!(newcon->flags & CON_ENABLED)) return; - if (bootconsole && (console->flags & CON_CONSDEV)) { - printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", - bootconsole->name, bootconsole->index, - console->name, console->index); - unregister_console(bootconsole); - console->flags &= ~CON_PRINTBUFFER; + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) 
== CON_CONSDEV)) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console handover:"); + for_each_console(bcon) + printk("boot [%s%d] ", bcon->name, bcon->index); + printk(" -> real [%s%d]\n", newcon->name, newcon->index); + for_each_console(bcon) + unregister_console(bcon); + newcon->flags &= ~CON_PRINTBUFFER; } else { - printk(KERN_INFO "console [%s%d] enabled\n", - console->name, console->index); + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); } /* @@ -1226,16 +1262,16 @@ void register_console(struct console *console) * preferred driver at the head of the list. */ acquire_console_sem(); - if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { - console->next = console_drivers; - console_drivers = console; - if (console->next) - console->next->flags &= ~CON_CONSDEV; + if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { + newcon->next = console_drivers; + console_drivers = newcon; + if (newcon->next) + newcon->next->flags &= ~CON_CONSDEV; } else { - console->next = console_drivers->next; - console_drivers->next = console; + newcon->next = console_drivers->next; + console_drivers->next = newcon; } - if (console->flags & CON_PRINTBUFFER) { + if (newcon->flags & CON_PRINTBUFFER) { /* * release_console_sem() will print out the buffered messages * for us. @@ -1287,11 +1323,13 @@ EXPORT_SYMBOL(unregister_console); static int __init disable_boot_consoles(void) { - if (console_drivers != NULL) { - if (console_drivers->flags & CON_BOOT) { + struct console *con; + + for_each_console(con) { + if (con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", - console_drivers->name, console_drivers->index); - return unregister_console(console_drivers); + con->name, con->index); + return unregister_console(con); } } return 0; -- cgit v1.2.3 From ddc1637af217dbd8bc51f30e6d24e84476a869a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 3 Jul 2009 17:34:24 +0800 Subject: kmemtrace: Print binary output only if 'bin' option is set Currently by default the output of kmemtrace is binary format instead of human-readable output. 
This patch makes the following changes: - We'll see human-readable output by default - We'll see binary output if the 'bin' option is set Note: you will probably need to explicitly disable context-info to get pure binary output: # echo 0 > options/context-info # echo 1 > options/bin # cat trace_pipe v2: - use %pF to print call_site Signed-off-by: Li Zefan Acked-by: Pekka Enberg Acked-by: Eduard - Gabriel Munteanu Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4DD0A0.5060500@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 120 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 30 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 1edaa9516e81..74903b62bcb6 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, + (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, + (unsigned long)entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { - struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; + int ret; + + trace_assign_type(entry, iter->ent); + + ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", + entry->type_id, (void *)entry->call_site, + (unsigned long)entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct kmemtrace_alloc_entry *entry; struct kmemtrace_user_event *ev; + struct kmemtrace_user_event_alloc *ev_alloc; + + trace_assign_type(entry, iter->ent); ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) @@ -271,12 +311,14 @@ kmemtrace_print_alloc_user, } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; + struct kmemtrace_free_entry *entry; struct kmemtrace_user_event *ev; + + trace_assign_type(entry, iter->ent); + ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; @@ -294,12 +336,14 @@ kmemtrace_print_free_user, /* The two other following provide a more minimalistic output */ static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_compress(struct trace_iterator *iter) { + struct kmemtrace_alloc_entry *entry; struct trace_seq *s = &iter->seq; int ret; + + trace_assign_type(entry, iter->ent); + /* Alloc entry */ ret = trace_seq_printf(s, " + "); if (!ret) @@ -362,12 +406,14 @@ kmemtrace_print_alloc_compress, } static enum
print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_compress(struct trace_iterator *iter) { + struct kmemtrace_free_entry *entry; struct trace_seq *s = &iter->seq; int ret; + trace_assign_type(entry, iter->ent); + /* Free entry */ ret = trace_seq_printf(s, " - "); if (!ret) @@ -421,32 +467,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - switch (entry->type) { - case TRACE_KMEM_ALLOC: { - struct kmemtrace_alloc_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_alloc_compress(iter, field); - else - return kmemtrace_print_alloc_user(iter, field); - } - - case TRACE_KMEM_FREE: { - struct kmemtrace_free_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_free_compress(iter, field); - else - return kmemtrace_print_free_user(iter, field); - } + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return TRACE_TYPE_UNHANDLED; + switch (entry->type) { + case TRACE_KMEM_ALLOC: + return kmemtrace_print_alloc_compress(iter); + case TRACE_KMEM_FREE: + return kmemtrace_print_free_compress(iter); default: return TRACE_TYPE_UNHANDLED; } } +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .trace = kmemtrace_print_alloc_user, + .binary = kmemtrace_print_alloc_user_bin, +}; + +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .trace = kmemtrace_print_free_user, + .binary = kmemtrace_print_free_user_bin, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, @@ -463,6 +508,21 @@ void kmemtrace_init(void) static int __init init_kmem_tracer(void) { - return register_tracer(&kmem_tracer); + if (!register_ftrace_event(&kmem_trace_alloc)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_ftrace_event(&kmem_trace_free)) { + pr_warning("Warning: could not register kmem events\n"); + return 1; + } + + if (!register_tracer(&kmem_tracer)) { + pr_warning("Warning: could not register the kmem tracer\n"); + return 1; + } + + return 0; } device_initcall(init_kmem_tracer); -- cgit v1.2.3 From 42204455f160dab0c47f19e1be23f5c927af2d17 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:50:00 +0530 Subject: x86: Clean up mtrr/amd.c: Fix trivial style problems : ERROR: trailing whitespace WARNING: line over 80 characters ERROR: do not use C99 // comments arch/x86/kernel/cpu/mtrr/amd.o: text data bss dec hex filename 501 32 0 533 215 amd.o.before 501 32 0 533 215 amd.o.after md5: 62f795eb840ee2d17b03df89e789e76c amd.o.before.asm 62f795eb840ee2d17b03df89e789e76c amd.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Also restructured comments to be standard, removed stray return, converted function description to DocBook style, etc. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/amd.c | 97 ++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b0e58f..33af14110dfd 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -7,15 +7,15 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned long low, high; rdmsr(MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ + /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; - /* The base masks off on the right alignment */ + /* The base masks off on the right alignment */ *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) @@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, return; } /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. * - * eg 111 1111 1111 1100 is 512K + * eg 111 1111 1111 1100 is 512K * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... */ low = (~low) & 0x1FFFC; *size = (low + 4) << (15 - PAGE_SHIFT); - return; } -static void amd_set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - [RETURNS] Nothing. -*/ +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { u32 regs[2]; /* - * Low is MTRR0 , High MTRR 1 + * Low is MTRR0, High MTRR 1 */ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); /* - * Blank to disable + * Blank to disable */ - if (size == 0) + if (size == 0) { regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! 
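+ * + * A worked instance of the shift, with 4K pages (PAGE_SHIFT == 12): + * 512K is 128 pages, so -size = -128 = ...1111 1000 0000; shifting + * right by 15 - PAGE_SHIFT = 3 gives ...1111 1111 0000, and masking + * with 0x0001FFFC leaves 0x0001FFF0, i.e. the desired inverted mask + * 111 1111 1111 1100 placed at bit offset 2.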
+ */ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | (base << PAGE_SHIFT) | (type + 1); + } /* - * The writeback rule is quite specific. See the manual. Its - * disable local interrupts, write back the cache, set the mtrr + * The writeback rule is quite specific. See the manual. It's: + * disable local interrupts, write back the cache, set the mtrr */ wbinvd(); wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); } -static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; @@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) set_mtrr_ops(&amd_mtrr_ops); return 0; } - -//arch_initcall(amd_mtrr_init); -- cgit v1.2.3 From 6c4caa1ab737502190e416b76e6c10d2bf24276a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:50:44 +0530 Subject: x86: Clean up mtrr/centaur.c Remove dead code and fix trivial style problems: ERROR: trailing whitespace X 2 WARNING: line over 80 characters X 3 ERROR: trailing whitespace ERROR: do not use C99 // comments X 2 arch/x86/kernel/cpu/mtrr/centaur.o: text data bss dec hex filename 605 32 68 705 2c1 centaur.o.before 605 32 68 705 2c1 centaur.o.after md5: a4865ea98ce3c163bb1d376a3949b3e3 centaur.o.before.asm a4865ea98ce3c163bb1d376a3949b3e3 centaur.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Standardized comments, DocBook, curly braces, newlines. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/centaur.c | 168 ++++++++----------------------------- 1 file changed, 35 insertions(+), 133 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a7a7ab..de89f14eff3a 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,7 +1,9 @@ #include #include + #include #include + #include "mtrr.h" static struct { @@ -12,25 +14,25 @@ static struct { static u8 centaur_mcr_reserved; static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ -/* - * Report boot time MCR setups +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. */ - static int centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error.
-*/ { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) if (lsize == 0) return i; } + return -ENOSPC; } -void -mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { centaur_mcr[mcr].low = lo; centaur_mcr[mcr].high = hi; @@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) *type = MTRR_TYPE_WRBACK; if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) *type = MTRR_TYPE_WRBACK; - } -static void centaur_set_mcr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) { unsigned long low, high; if (size == 0) { - /* Disable */ + /* Disable */ high = low = 0; } else { high = base << PAGE_SHIFT; - if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ - else { + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << PAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ } } centaur_mcr[reg].high = high; @@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, wrmsr(MSR_IDT_MCR0 + reg, low, high); } -#if 0 -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init -centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ - lo &= ~0x1C0; /* clear key */ - lo |= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { - if (!(lo & (1 << (9 + i)))) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1 << i); - } - } - /* - * Throw the main write-combining switch... - * However if OOSTORE is enabled then people have already done far - * cleverer things and we should behave. 
- */ - - lo |= 15; /* Write combine enables */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -/* - * Initialise the original winchip with read only MCR registers - * no used bitmask for the BIOS to pass on and write only control - */ - -static void __init -centaur_mcr0_init(void) -{ - unsigned i; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - /* Clear any unconfigured MCR's. - * This way we are sure that the centaur_mcr array contains the actual - * values. The disadvantage is that any BIOS tweaks are thus undone. - * - */ - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - } - - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ -} - -/* - * Initialise Winchip series MCR registers - */ - -static void __init -centaur_mcr_init(void) -{ - struct set_mtrr_context ctxt; - - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - if (boot_cpu_data.x86_model == 4) - centaur_mcr0_init(); - else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) - centaur_mcr1_init(); - - set_mtrr_done(&ctxt); -} -#endif - -static int centaur_validate_add_page(unsigned long base, - unsigned long size, unsigned int type) +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { /* - * FIXME: Winchip2 supports uncached + * FIXME: Winchip2 supports uncached */ - if (type != MTRR_TYPE_WRCOMB && + if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - printk(KERN_WARNING - "mtrr: only write-combining%s supported\n", - centaur_mcr_type ? " and uncacheable are" - : " is"); + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } return 0; @@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, static struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, -// .init = centaur_mcr_init, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, @@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) set_mtrr_ops(¢aur_mtrr_ops); return 0; } - -//arch_initcall(centaur_init_mtrr); -- cgit v1.2.3 From 63f9600fadb10ea739108ae93e3e842d9843c58b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:51:32 +0530 Subject: x86: Clean up mtrr/cleanup.c Fix trivial style problems: WARNING: Use #include instead of WARNING: Use #include instead of Also, nr_mtrr_spare_reg should be unsigned long. arch/x86/kernel/cpu/mtrr/cleanup.o: text data bss dec hex filename 6241 8992 2056 17289 4389 cleanup.o.before 6241 8992 2056 17289 4389 cleanup.o.after The md5 has changed: 1a7a27513aef1825236daf29110fe657 cleanup.o.before.asm bcea358efa2532b6020e338e158447af cleanup.o.after.asm Because a WARN_ON()'s __LINE__ value changed by 3 lines. Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Did lots of other cleanups to make the code look more consistent. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 350 +++++++++++++++++++------------------ 1 file changed, 176 insertions(+), 174 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a18a50d..b8aba811b60e 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,51 +1,52 @@ -/* MTRR (Memory Type Range Register) cleanup - - Copyright (C) 2009 Yinghai Lu - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ #include #include #include #include #include -#include #include +#include +#include +#include +#include #include #include -#include -#include #include -#include + #include "mtrr.h" -/* should be related to MTRR_VAR_RANGES nums */ +/* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 struct res_range { - unsigned long start; - unsigned long end; + unsigned long start; + unsigned long end; }; static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { - /* out of slots */ + /* Out of slots: */ if (nr_range >= RANGE_NUM) return nr_range; @@ -58,12 +59,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, } static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range_with_merge(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { int i; - /* try to merge it with old one */ + /* Try to merge it with old one: */ for (i = 0; i < nr_range; i++) { unsigned long final_start, final_end; unsigned long common_start, common_end; @@ -84,7 +85,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, return nr_range; } - /* need to add that */ + /* Need to add it: */ return add_range(range, nr_range, start, end); } @@ -117,7 +118,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) } if (start > range[j].start && end < range[j].end) { - /* find the new spare */ + /* Find the new spare: */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) break; @@ -147,13 +148,19 @@ static int __init cmp_range(const void *x1, const void *x2) } struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; }; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + static int __initdata debug_print; +#define Dprintk(x...) 
do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -180,7 +187,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, range[i].start, range[i].end + 1); } - /* take out UC ranges */ + /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_UNCACHABLE && @@ -244,10 +251,9 @@ static int __initdata nr_range; static unsigned long __init sum_ranges(struct res_range *range, int nr_range) { - unsigned long sum; + unsigned long sum = 0; int i; - sum = 0; for (i = 0; i < nr_range; i++) sum += range[i].end + 1 - range[i].start; @@ -288,7 +294,7 @@ struct var_mtrr_state { static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -301,7 +307,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, mask = (1ULL << address_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); - base = ((u64)basek) << 10; + base = ((u64)basek) << 10; base |= type; mask |= 0x800; @@ -317,15 +323,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, static void __init save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) + unsigned char type) { range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); range_state[reg].type = type; } -static void __init -set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(unsigned int address_bits) { unsigned long basek, sizek; unsigned char type; @@ -342,11 +347,11 @@ set_var_mtrr_all(unsigned int address_bits) static unsigned long to_size_factor(unsigned long sizek, char *factorp) { - char factor; unsigned long base = sizek; + char factor; if (base & ((1<<10) - 1)) { - /* not MB alignment */ + /* Not MB-aligned: */ factor = 'K'; } else if (base & ((1<<20) - 1)) { factor = 'M'; @@ -372,11 +377,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long max_align, align; unsigned long sizek; - /* Compute the maximum size I can make a range */ + /* Compute the maximum size with which we can make a range: */ if (range_startk) max_align = ffs(range_startk) - 1; else max_align = 32; + align = fls(range_sizek) - 1; if (align > max_align) align = max_align; @@ -386,11 +392,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; - start_base = to_size_factor(range_startk, - &start_factor), - size_base = to_size_factor(sizek, &size_factor), + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); - printk(KERN_DEBUG "Setting variable MTRR %d, " + Dprintk("Setting variable MTRR %d, " "base: %ld%cB, range: %ld%cB, type %s\n", reg, start_base, start_factor, size_base, size_factor, @@ -425,10 +430,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; - /* align with gran size, prevent small block used up MTRRs */ + /* Align with gran size, prevent small block used up MTRRs: */ range_basek = 
ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); range_sizek = ALIGN(state->range_sizek, gran_sizek); @@ -439,22 +445,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } state->range_sizek = range_sizek; - /* try to append some small hole */ + /* Try to append some small hole: */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - /* no increase */ + /* No increase: */ if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); return 0; } - /* only cut back, when it is not the last */ + /* Only cut back when it is not the last: */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { if (range0_sizek >= chunk_sizek) @@ -470,16 +475,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_try: range_basek = range0_basek + range0_sizek; - /* one hole in the middle */ + /* One hole in the middle: */ if (range_basek > basek && range_basek <= (basek + sizek)) second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - /* one hole in middle or at end */ + /* One hole in middle or at the end: */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - /* hole size should be less than half of range0 size */ + /* Hole size should be less than half of range0 size: */ if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; @@ -491,32 +496,30 @@ second_try: } if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); } if (range0_sizek < state->range_sizek) { - /* need to handle left over */ + /* Need to handle left over range: */ range_sizek = state->range_sizek - range0_sizek; - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); } if (hole_sizek) { hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); } @@ -537,23 +540,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, basek = base_pfn << (PAGE_SHIFT - 10); sizek = size_pfn << (PAGE_SHIFT - 10); - /* See if I can merge with the last range */ + /* See if I can merge with the last range: */ if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } - /* Write the range mtrrs */ + /* Write the range mtrrs: */ if 
(state->range_sizek != 0) second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - /* Allocate an msr */ + /* Allocate an msr: */ state->range_startk = basek + second_sizek; state->range_sizek = sizek - second_sizek; } -/* mininum size of mtrr block that can take hole */ +/* Mininum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) @@ -565,7 +568,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) } early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); -/* granity of mtrr of block */ +/* Granularity of mtrr of block: */ static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) @@ -577,7 +580,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static int nr_mtrr_spare_reg __initdata = +static unsigned long nr_mtrr_spare_reg __initdata = CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; static int __init parse_mtrr_spare_reg(char *arg) @@ -586,7 +589,6 @@ static int __init parse_mtrr_spare_reg(char *arg) nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); return 0; } - early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init @@ -594,8 +596,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; - int i; int num_reg; + int i; var_state.range_startk = 0; var_state.range_sizek = 0; @@ -605,17 +607,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, memset(range_state, 0, sizeof(range_state)); - /* Write the range etc */ - for (i = 0; i < nr_range; i++) + /* Write the range: */ + for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + } - /* Write the last range */ + /* Write the last range: */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); num_reg = var_state.reg; - /* Clear out the extra MTRR's */ + /* Clear out the extra MTRR's: */ while (var_state.reg < num_var_ranges) { save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; @@ -625,11 +628,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, } struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; }; /* @@ -645,10 +648,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static void __init print_out_mtrr_range_state(void) { - int i; char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { @@ -676,10 +679,10 @@ static int __init mtrr_need_cleanup(void) int i; mtrr_type type; unsigned long size; - /* extra one for all 0 */ + /* Extra one for all 0: */ int num[MTRR_NUM_TYPES + 1]; - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -693,88 +696,86 @@ static int __init mtrr_need_cleanup(void) num[type]++; } - /* check if we got UC entries */ + /* Check if we got UC entries: */ if (!num[MTRR_TYPE_UNCACHABLE]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) + num_var_ranges - 
num[MTRR_NUM_TYPES]) return 0; return 1; } static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) { - int num_reg; static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; unsigned long range_sums_new; + static int nr_range_new; + int num_reg; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - /* we got new setting in range_state, check it */ + /* We got new setting in range_state, check it: */ memset(range_new, 0, sizeof(range_new)); nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); + x_remove_base, x_remove_size); range_sums_new = sum_ranges(range_new, nr_range_new); result[i].chunk_sizek = chunk_size >> 10; result[i].gran_sizek = gran_size >> 10; result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } - /* double check it */ + /* Double check it: */ if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; } static void __init mtrr_print_out_one_result(int i) { - char gran_factor, chunk_factor, lose_factor; unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? 
"-" : "", + lose_base, lose_factor); } static int __init mtrr_search_optimal_index(void) { - int i; int num_reg_good; int index_good; + int i; if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) @@ -796,24 +797,24 @@ static int __init mtrr_search_optimal_index(void) return index_good; } - int __init mtrr_cleanup(unsigned address_bits) { - unsigned long extra_remove_base, extra_remove_size; + unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; - mtrr_type type; u64 chunk_size, gran_size; + mtrr_type type; int index_good; int i; if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -822,29 +823,28 @@ int __init mtrr_cleanup(unsigned address_bits) range_state[i].type = type; } - /* check if we need handle it and can handle it */ + /* Check if we need handle it and can handle it: */ if (!mtrr_need_cleanup()) return 0; - /* print original var MTRRs at first, for debugging: */ + /* Print original var MTRRs at first, for debugging: */ printk(KERN_DEBUG "original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ + /* Sort the ranges: */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); @@ -854,7 +854,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { i = 0; mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); mtrr_print_out_one_result(i); @@ -880,7 +880,7 @@ int __init mtrr_cleanup(unsigned address_bits) continue; mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); printk(KERN_INFO "\n"); @@ -890,7 +890,7 @@ int __init mtrr_cleanup(unsigned address_bits) } } - /* try to find the optimal index */ + /* Try to find the optimal index: */ index_good = mtrr_search_optimal_index(); if (index_good != -1) { @@ -898,7 +898,7 @@ int __init mtrr_cleanup(unsigned address_bits) i = index_good; mtrr_print_out_one_result(i); - /* convert ranges to var ranges state */ + /* Convert ranges to var ranges state: */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; @@ -941,8 +941,8 @@ early_param("disable_mtrr_trim", 
disable_mtrr_trim_setup); * Note this won't check if the MTRRs < 4GB where the magic bit doesn't * apply to are wrong, but so far we don't know of any such case in the wild. */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) int __init amd_special_default_mtrr(void) { @@ -952,7 +952,7 @@ int __init amd_special_default_mtrr(void) return 0; if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ + /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) return 0; /* @@ -965,19 +965,21 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } + /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -985,7 +987,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any + * all of the memory the kernel is intending to use. If not, it'll trim any * memory off the end by adjusting end_pfn, removing it from the kernel's * allocation pools, warning the user with an obnoxious message. */ @@ -994,21 +996,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 total_trim_size; - /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1017,7 +1020,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) range_state[i].type = type; } - /* Find highest cached pfn */ + /* Find highest cached pfn: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) @@ -1028,13 +1031,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) highest_pfn = base + size; } - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); return 0; } - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -1046,11 +1049,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) num[type]++; } - /* no entry for WB? */ + /* No entry for WB? 
*/ if (!num[MTRR_TYPE_WRBACK]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC: */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != num_var_ranges - num[MTRR_NUM_TYPES]) return 0; @@ -1066,31 +1069,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + /* Check the head: */ total_trim_size = 0; - /* check the head */ if (range[0].start) total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ + + /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } - /* check the top */ + + /* Check the top: */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) total_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); + pr_info("update e820 for mtrr\n"); update_e820(); return 1; @@ -1098,4 +1101,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - -- cgit v1.2.3 From 2311037708c170977506fbcbe0a2ba0c6d221940 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:52:08 +0530 Subject: x86: Clean up mtrr/cyrix.c Fix trivial style problems: WARNING: Use #include instead of WARNING: line over 80 characters ERROR: do not initialise statics to 0 or NULL ERROR: space prohibited after that open parenthesis '(' X 2 ERROR: space prohibited before that close parenthesis ')' X 2 ERROR: trailing whitespace X 2 ERROR: trailing statements should be on next line ERROR: do not use C99 // comments X 2 arch/x86/kernel/cpu/mtrr/cyrix.o: text data bss dec hex filename 1637 32 8 1677 68d cyrix.o.before 1637 32 8 1677 68d cyrix.o.after md5: 6f52abd06905be3f4cabb5239f9b0ff0 cyrix.o.before.asm 6f52abd06905be3f4cabb5239f9b0ff0 cyrix.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Made the code more consistent ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cyrix.c | 94 ++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c320040c..228d982ce09c 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,38 +1,40 @@ #include +#include #include -#include -#include -#include + #include #include +#include +#include + #include "mtrr.h" static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) { - unsigned long flags; unsigned char arr, ccr3, rcr, shift; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - /* Save flags and disable interrupts */ local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86(arr); - ((unsigned char *) base)[2] = getCx86(arr + 1); - ((unsigned char *) base)[1] = getCx86(arr + 2); + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned 
char *)base)[1] = getCx86(arr + 2); rcr = getCx86(CX86_RCR_BASE + reg); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) @@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } } +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ static int cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free ARR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i; switch (replace_reg) { case 7: @@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) cyrix_get_arr(7, &lbase, &lsize, <ype); if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ + /* Else try ARR0-ARR6 first */ } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ cyrix_get_arr(i, &lbase, &lsize, <ype); if ((lsize == 0) && (size >= 0x40)) return i; @@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -static u32 cr4 = 0; -static u32 ccr3; +static u32 cr4, ccr3; static void prepare_set(void) { u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ + /* + * Disable and flush caches. 
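+	 * (setting CR0.CD stops new cache fills; the wbinvd below then
+	 * writes back and invalidates whatever is already cached).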
+ * Note that wbinvd flushes the TLBs as a side-effect + */ cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); @@ -147,22 +156,21 @@ static void prepare_set(void) /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); - } static void post_set(void) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ccr3); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); } @@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, size >>= 6; size &= 0x7fff; /* make sure arr_size <= 14 */ - for (arr_size = 0; size; arr_size++, size >>= 1) ; + for (arr_size = 0; size; arr_size++, size >>= 1) + ; if (reg < 7) { switch (type) { @@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, prepare_set(); base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr + 1, ((unsigned char *) &base)[2]); - setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); } typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; + unsigned long base; + unsigned long size; + mtrr_type type; } arr_state_t; static arr_state_t arr_state[8] = { @@ -247,16 +256,17 @@ static void cyrix_set_all(void) setCx86(CX86_CCR0 + i, ccr_state[i]); for (; i < 7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for (i = 0; i < 8; i++) - cyrix_set_arr(i, arr_state[i].base, + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, arr_state[i].size, arr_state[i].type); + } post_set(); } static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, -// .init = cyrix_arr_init, .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, @@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) set_mtrr_ops(&cyrix_mtrr_ops); return 0; } - -//arch_initcall(cyrix_init_mtrr); -- cgit v1.2.3 From a1a499a39911fcfecbebaba1f38588088909f918 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:53:00 +0530 Subject: x86: Clean up mtrr/generic.c Fix following trivial style problems: ERROR: trailing whitespace X 4 WARNING: Use #include instead of WARNING: braces {} are not necessary for single statement blocks X 3 ERROR: "foo * bar" should be "foo *bar" WARNING: line over 80 characters X 6 ERROR: "foo * bar" should be "foo *bar" ERROR: spaces required around that '=' (ctx:VxO) ERROR: space required before that '-' (ctx:OxV) WARNING: suspect code indent for conditional statements (8, 12) ERROR: spaces required around that '=' (ctx:VxV) ERROR: do not initialise statics to 0 or NULL ERROR: space prohibited after that open parenthesis '(' X 2 ERROR: space prohibited before that close parenthesis ')' X 2 ERROR: trailing statements should be on next line ERROR: return is not a function, parentheses are not required Also use pr_debug and pr_warning where possible. 
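As background for the pr_debug() conversions (an illustration, not part of the patch): with CONFIG_DYNAMIC_DEBUG disabled, pr_debug() only expands to a real printk(KERN_DEBUG ...) when DEBUG is defined before the includes, which is why the patch adds "#define DEBUG" at the top of generic.c. A minimal sketch of the convention:

	#define DEBUG			/* must come before the includes */
	#include <linux/kernel.h>

	static void example(void)
	{
		/* Compiles to printk(KERN_DEBUG ...) because DEBUG is defined: */
		pr_debug("MTRR default type: %s\n", "write-back");

		/* Printed unconditionally, at KERN_WARNING severity: */
		pr_warning("mtrr: example warning\n");
	}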
arch/x86/kernel/cpu/mtrr/generic.o: text data bss dec hex filename 5652 77 4224 9953 26e1 generic.o.before 5652 77 4220 9949 26dd generic.o.after The md5 changed: b34d6c045f06daa4ed092b90cc760e8f generic.o.before.asm a490c6251cfd8442fbffecc0e09a573d generic.o.after.asm Because mtrr_state moved from data to bss, changing its offsets - and also because __LINE__ numbers changed. Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Further cleanups to make the code more consistent ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 304 +++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 135 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69f0b27..55da0c5f68dd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,28 +1,34 @@ -/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - because MTRRs can span upto 40 bits (36bits on most modern x86) */ +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span upto 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include + #include +#include #include +#include +#include +#include #include + #include "mtrr.h" struct fixed_range_block { - int base_msr; /* start address of an MTRR block */ - int ranges; /* number of MTRRs in this block */ + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { - { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; -struct mtrr_state_type mtrr_state = {}; +struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); -/** +/* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 
3.30 February 2006), section @@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) { + if (!(mtrr_state.enabled & 2)) return mtrr_state.def_type; - } prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { @@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) if (start_state != end_state) return 0xFE; - if ((start & mask) != (base & mask)) { + if ((start & mask) != (base & mask)) continue; - } curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { @@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) curr_match = MTRR_TYPE_WRTHROUGH; } - if (prev_match != curr_match) { + if (prev_match != curr_match) return MTRR_TYPE_UNCACHABLE; - } } if (mtrr_tom2) { @@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } -/* Get the MSR pair relating to a var range */ +/* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { @@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -/* fill the MSR pair relating to a var range */ +/* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { @@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, vr[index].mask_hi = mask_hi; } -static void -get_fixed_ranges(mtrr_type * frs) +static void get_fixed_ranges(mtrr_type *frs) { - unsigned int *p = (unsigned int *) frs; + unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); @@ -217,22 +219,22 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, - mtrr_type type) + mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } -static void __init print_fixed(unsigned base, unsigned step, - const mtrr_type *types) +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; @@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_DEBUG "MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } - printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? 
"en" : "dis"); + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); if (size_or_mask & 0xffffffffUL) high_width = ffs(size_or_mask & 0xffffffffUL) - 1; else high_width = ffs(size_or_mask>>32) + 32 - 1; high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_DEBUG " %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); + pr_debug(" %u disabled\n", i); } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* Grab all of the MTRR state for this CPU into *state */ +/* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { - unsigned int i; struct mtrr_var_range *vrs; - unsigned lo, dummy; unsigned long flags; + unsigned lo, dummy; + unsigned int i; vrs = mtrr_state.var_ranges; @@ -324,6 +327,7 @@ void __init get_mtrr_state(void) if (amd_special_default_mtrr()) { unsigned low, high; + /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; @@ -344,10 +348,9 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); - } -/* Some BIOS's are fucked and don't set all MTRRs the same! */ +/* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; @@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } -/* Doesn't attempt to pass an error out to MTRR users - because it's quite complicated in some cases and probably not - worth it because the best error handling is to ignore it. */ +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. 
+ */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { - if (wrmsr_safe(msr, a, b) < 0) + if (wrmsr_safe(msr, a, b) < 0) { printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); + } } /** - * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have @@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) * * Returns: The index of the region on success, else negative on error. */ -int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } + return -ENOSPC; } @@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { - /* Invalid (i.e. free) range */ + /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; @@ -471,27 +482,31 @@ out_put_cpu: } /** - * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ -static int set_fixed_ranges(mtrr_type * frs) +static int set_fixed_ranges(mtrr_type *frs) { - unsigned long long *saved = (unsigned long long *) frs; + unsigned long long *saved = (unsigned long long *)frs; bool changed = false; - int block=-1, range; + int block = -1, range; k8_check_syscfg_dram_mod_en(); - while (fixed_range_blocks[++block].ranges) - for (range=0; range < fixed_range_blocks[block].ranges; range++) - set_fixed_range(fixed_range_blocks[block].base_msr + range, - &changed, (unsigned int *) saved++); + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } return changed; } -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. 
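+ * (The live MSR pair is read back and compared first, so a WRMSR is
+ * only issued when the base or mask actually differs.)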
+ */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; @@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } @@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; */ static unsigned long set_mtrr_state(void) { - unsigned int i; unsigned long change_mask = 0; + unsigned int i; - for (i = 0; i < num_var_ranges; i++) + for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) } -static unsigned long cr4 = 0; +static unsigned long cr4; static DEFINE_SPINLOCK(set_atomicity_lock); /* - * Since we are disabling the cache don't allow any interrupts - they - * would run extremely slow and would only increase the pain. The caller must - * ensure that local interrupts are disabled and are reenabled after post_set() - * has been called. + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. */ - static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ spin_lock(&set_atomicity_lock); - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. 
*/ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } @@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ __flush_tlb(); - /* Save MTRR state */ + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - /* Disable MTRRs, and set the default type to uncached */ + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) { - /* Flush TLBs (no need to flush caches - they are disabled) */ + /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } @@ -623,24 +647,27 @@ static void generic_set_all(void) post_set(); local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ + /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - + } +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - [RETURNS] Nothing. -*/ { unsigned long flags; struct mtrr_var_range *vr; @@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, prepare_set(); if (size == 0) { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. */ + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. 
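+		 * (Bit 11 of MTRRphysMask is the valid bit; zeroing the
+		 * whole mask clears it, which is what marks the range free.)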
+ */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { @@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, local_irq_restore(flags); } -int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) { unsigned long lbase, last; - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1) ; + lbase = lbase >> 1, last = last >> 1) + ; if (lbase != last) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } - static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return (config & (1 << 10)); + return config & (1 << 10); } int positive_have_wrcomb(void) @@ -716,14 +749,15 @@ int positive_have_wrcomb(void) return 1; } -/* generic structure... +/* + * Generic structure... */ struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, }; -- cgit v1.2.3 From 26dc67eda19beafb7e5ef2770cec5b3ee5995a8e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:53:40 +0530 Subject: x86: Clean up mtrr/if.c Fix: WARNING: Use #include instead of ERROR: trailing whitespace X 7 ERROR: trailing statements should be on next line X 3 WARNING: line over 80 characters X 5 ERROR: space required before the open parenthesis '(' arch/x86/kernel/cpu/mtrr/if.o: text data bss dec hex filename 2239 4 0 2243 8c3 if.o.before 2239 4 0 2243 8c3 if.o.after md5: 78d1f2aa4843ec6509c18e2dee54bc7f if.o.before.asm 78d1f2aa4843ec6509c18e2dee54bc7f if.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More cleanups to make the code more consistent. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/if.c | 135 ++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 59 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52913a4..08b6ea4c62b4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,27 +1,28 @@ -#include -#include #include -#include -#include #include -#include +#include +#include +#include +#include +#include #define LINE_SIZE 80 #include + #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { - "uncachable", /* 0 */ - "write-combining", /* 1 */ - "?", /* 2 */ - "?", /* 3 */ - "write-through", /* 4 */ - "write-protect", /* 5 */ - "write-back", /* 6 */ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) @@ -35,8 +36,8 @@ static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { + unsigned int *fcount = FILE_FCOUNT(file); int reg, max; - unsigned int *fcount = FILE_FCOUNT(file); max = num_var_ranges; if (fcount == NULL) { @@ -61,8 +62,8 @@ static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { - int reg; unsigned int *fcount = FILE_FCOUNT(file); + int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) @@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size, return reg; } -/* RED-PEN: seq_file can seek now. this is ignored. */ +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ { int i, err; unsigned long reg; @@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EPERM; if (!len) return -EINVAL; + memset(line, 0, LINE_SIZE); if (len > LINE_SIZE) len = LINE_SIZE; if (copy_from_user(line, buf, len - 1)) return -EFAULT; + linelen = strlen(line); ptr = line + linelen - 1; if (linelen && *ptr == '\n') *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); @@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return err; return len; } + if (strncmp(line, "base=", 5)) return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "size=", 5)) return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page((unsigned long) base, (unsigned long) size, i, - true); + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; @@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, 
unsigned long __arg) case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { - struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); @@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); @@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (err) return err; - switch(cmd) { + switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof gentry)) @@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); @@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) return err; } -static int -mtrr_close(struct inode *ino, struct file *file) +static int mtrr_close(struct inode *ino, struct file *file) { - int i, max; unsigned int *fcount = FILE_FCOUNT(file); + int i, max; if (fcount != NULL) { max = num_var_ranges; @@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); static int mtrr_open(struct inode *inode, struct file *file) { - if (!mtrr_if) + if (!mtrr_if) return -EIO; - if (!mtrr_if->get) - return -ENXIO; + if (!mtrr_if->get) + return -ENXIO; return single_open(file, mtrr_seq_show, NULL); } static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, }; static int mtrr_seq_show(struct seq_file *seq, void *offset) @@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); - if (size == 0) + if (size == 0) { mtrr_usage_table[i] = 0; - else { - if (size < (0x100000 >> PAGE_SHIFT)) { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } else { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - /* RED-PEN: base can be > 32bit */ - len += seq_printf(seq, - "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_usage_table[i], mtrr_attrib_to_str(type)); + continue; } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit 
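+		 * (base is a page frame number, so the %06lx format can
+		 * still represent physical addresses above 4GB).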
*/ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); } return 0; } @@ -422,6 +440,5 @@ static int __init mtrr_if_init(void) proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } - arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 3ec8dbcb09bb6df83993ca03e88cb85e3aaa8edb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:54:16 +0530 Subject: x86: Clean up mtrr/mtrr.h Fix: ERROR: do not use C99 // comments ERROR: "foo * bar" should be "foo *bar" X 2 Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More tidyups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/mtrr.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b767f206..a501dee9a87a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,5 +1,5 @@ /* - * local mtrr defines. + * local MTRR defines. */ #include @@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; u32 use_intel_if; -// void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); + unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, @@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { - unsigned long flags; - unsigned long cr4val; - u32 deftype_lo; - u32 deftype_hi; - u32 ccr3; + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops * ops); +extern void set_mtrr_ops(struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops * mtrr_if; +extern struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) -- cgit v1.2.3 From 09b22c85d59dd935fdfa71655a443785e3f99c18 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:54:53 +0530 Subject: x86: Clean up mtrr/state.c Fix: WARNING: Use #include instead of WARNING: line over 80 characters X 4 arch/x86/kernel/cpu/mtrr/state.o: text data bss dec hex filename 864 0 0 864 360 state.o.before 864 0 0 864 360 state.o.after md5: c5c4364b9aeac74d70111e1e49667a2c state.o.before.asm c5c4364b9aeac74d70111e1e49667a2c state.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More cleanups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/state.c | 68 +++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c 
index 1f5fb1588d1f..dfc80b4e6b0d 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -1,24 +1,25 @@ -#include #include -#include -#include -#include +#include +#include + #include #include -#include "mtrr.h" +#include +#include +#include "mtrr.h" -/* Put the processor into a state where MTRRs can be safely set */ +/* Put the processor into a state where MTRRs can be safely set */ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) { unsigned int cr0; - /* Disable interrupts locally */ + /* Disable interrupts locally */ local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { - /* Save value of CR4 and clear Page Global Enable (bit 7) */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~X86_CR4_PGE); @@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) write_cr0(cr0); wbinvd(); - if (use_intel()) - /* Save MTRR state */ + if (use_intel()) { + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else were excluded at the top */ + } else { + /* + * Cyrix ARRs - + * everything else were excluded at the top + */ ctxt->ccr3 = getCx86(CX86_CCR3); + } } } void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { - if (use_intel()) - /* Disable MTRRs, and set the default type to uncached */ + if (use_intel()) { + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - else if (is_cpu(CYRIX)) - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } else { + if (is_cpu(CYRIX)) { + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } + } } -/* Restore the processor after a set_mtrr_prepare */ +/* Restore the processor after a set_mtrr_prepare */ void set_mtrr_done(struct set_mtrr_context *ctxt) { if (use_intel() || is_cpu(CYRIX)) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); - /* Restore MTRRdefType */ - if (use_intel()) + /* Restore MTRRdefType */ + if (use_intel()) { /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else was excluded at the top */ + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, + ctxt->deftype_hi); + } else { + /* + * Cyrix ARRs - + * everything else was excluded at the top + */ setCx86(CX86_CCR3, ctxt->ccr3); + } - /* Enable caches */ + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ + /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(ctxt->cr4val); } - /* Re-enable interrupts locally (if enabled previously) */ + /* Re-enable interrupts locally (if enabled previously) */ local_irq_restore(ctxt->flags); } - -- cgit v1.2.3 From dbd51be026eaf84088fdee7fab9f38fa92eef26d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:56:28 +0530 Subject: x86: Clean up mtrr/main.c Fix following trivial style problems: ERROR: trailing whitespace X 25 WARNING: Use #include instead of WARNING: Use #include instead of ERROR: do not initialise externals to 0 or NULL X 2 ERROR: "foo * bar" should be "foo *bar" X 5 ERROR: do not use assignment in if condition X 2 WARNING: line over 80 characters X 8 ERROR: return is not a function, parentheses are not required WARNING: braces {} are not necessary for any arm of 
this statement ERROR: space required before the open parenthesis '(' X 2 ERROR: open brace '{' following function declarations go on the next line ERROR: space required after that ',' (ctx:VxV) X 8 ERROR: space required before the open parenthesis '(' X 3 ERROR: else should follow close brace '}' WARNING: space prohibited between function name and open parenthesis '(' WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable X 2 Also use pr_debug and pr_warning where possible. total: 50 errors, 14 warnings arch/x86/kernel/cpu/mtrr/main.o: text data bss dec hex filename 3668 116 4156 7940 1f04 main.o.before 3668 116 4156 7940 1f04 main.o.after md5: e01af2fd28deef77c8d01e71acfbd365 main.o.before.asm e01af2fd28deef77c8d01e71acfbd365 main.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> Cc: Avi Kivity # Avi, please have a look at the kvm_para.h bit [ More cleanups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 455 +++++++++++++++++++++------------------- 1 file changed, 242 insertions(+), 213 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b5aeaf..7af0f88a4163 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -25,43 +25,48 @@ Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 - This was cleaned and made readable by Patrick Mochel - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). */ +#define DEBUG + +#include /* FIXME: kvm_para.h needs this */ + +#include +#include #include +#include #include +#include +#include #include #include -#include -#include -#include +#include #include #include -#include -#include #include -#include + #include "mtrr.h" -u32 num_var_ranges = 0; +u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops * mtrr_if = NULL; +struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops * ops) +void set_mtrr_ops(struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -72,30 +77,36 @@ static int have_wrcomb(void) { struct pci_dev *dev; u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("mtrr: Serverworks LE rev < 6 detected. 
Write-combining disabled.\n"); pci_dev_put(dev); return 0; } } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } /* This function returns the number of variable MTRRs */ @@ -103,12 +114,13 @@ static void __init set_num_var_ranges(void) { unsigned long config = 0, dummy; - if (use_intel()) { + if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - } else if (is_cpu(AMD)) + else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; + num_var_ranges = config & 0xff; } @@ -130,10 +142,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/** + * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * + * Returns nothing. + */ static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. -*/ { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; @@ -142,18 +156,19 @@ static void ipi_handler(void *info) local_irq_save(flags); atomic_dec(&data->count); - while(!atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else + } else { mtrr_if->set_all(); + } atomic_dec(&data->count); - while(atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); @@ -161,7 +176,8 @@ static void ipi_handler(void *info) #endif } -static inline int types_compatible(mtrr_type type1, mtrr_type type2) { +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || @@ -176,10 +192,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * + * * 1. Send IPI to do the following: * 2. Disable Interrupts - * 3. Wait for all procs to do so + * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit @@ -189,26 +205,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching - * 13. Set PGE + * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. - * + * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. 
The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. + * The CPU vendors may each do it differently, + * so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate + * to be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { struct set_mtrr_data data; unsigned long flags; @@ -218,121 +235,122 @@ static void set_mtrr(unsigned int reg, unsigned long base, data.smp_size = size; data.smp_type = type; atomic_set(&data.count, num_booting_cpus() - 1); - /* make sure data.count is visible before unleashing other CPUs */ + + /* Make sure data.count is visible before unleashing other CPUs */ smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); - /* Start the ball rolling on other CPUs */ + /* Start the ball rolling on other CPUs */ if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); - /* ok, reset count and toggle gate */ + /* Ok, reset count and toggle gate */ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,1); + atomic_set(&data.gate, 1); - /* do our MTRR business */ + /* Do our MTRR business */ - /* HACK! + /* + * HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. + * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); + if (reg != ~0U) + mtrr_if->set(reg, base, size, type); - /* wait for the others */ - while(atomic_read(&data.count)) + /* Wait for the others */ + while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. 
This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. 
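+ *
+ * (Sketch, not part of the original kernel-doc: both arguments are in
+ * 4 kB pages, so a driver asking for write-combining on a framebuffer
+ * whose physical byte address and length are fb_base and fb_size
+ * (hypothetical names) would call
+ *
+ *	int reg = mtrr_add_page(fb_base >> PAGE_SHIFT,
+ *				fb_size >> PAGE_SHIFT,
+ *				MTRR_TYPE_WRCOMB, true);
+ *
+ * and treat the returned reg purely as a cookie.)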
*/ - -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { + unsigned long lbase, lsize; int i, replace, error; mtrr_type ltype; - unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + + error = mtrr_if->validate_add_page(base, size, type); + if (error) return error; if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + pr_warning("mtrr: type: %u invalid\n", type); return -EINVAL; } - /* If the type is WC, check that this processor supports it */ + /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); + pr_warning("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - printk(KERN_WARNING "mtrr: zero sized request\n"); + pr_warning("mtrr: zero sized request\n"); return -EINVAL; } if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -341,36 +359,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ get_online_cpus(); - /* Search for existing MTRR */ + + /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) continue; - /* At this point we know there is some kind of overlap/enclosure */ + /* + * At this point we know there is some kind of + * overlap/enclosure + */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? 
i : -2; continue; - } - else if (types_compatible(type, ltype)) + } else if (types_compatible(type, ltype)) continue; } - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); goto out; } - /* New region is enclosed by an existing region */ + /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); goto out; } if (increment) @@ -378,7 +400,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; goto out; } - /* Search for an empty MTRR */ + /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); @@ -393,8 +415,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, mtrr_usage_table[replace] = 0; } } - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); + } else { + pr_info("mtrr: no more MTRRs available\n"); + } error = i; out: mutex_unlock(&mtrr_mutex); @@ -405,10 +428,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -416,66 +437,64 @@ static int mtrr_check(unsigned long base, unsigned long size) } /** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. 
+ * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); /** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. 
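+ *
+ * (Sketch, not part of the original kernel-doc: because a valid @reg
+ * makes @base and @size ignored, the usual driver-side call is simply
+ *
+ *	mtrr_del_page(reg, 0, 0);
+ *
+ * where reg is the cookie returned earlier by mtrr_add_page().)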
*/ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -500,22 +519,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + pr_warning("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + pr_warning("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -526,33 +545,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) put_online_cpus(); return error; } + /** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! +/* + * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ @@ -576,29 +593,28 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device * sysdev, pm_message_t state) +static int mtrr_save(struct sys_device *sysdev, pm_message_t state) { int i; for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } -static int mtrr_restore(struct sys_device * sysdev) +static int mtrr_restore(struct sys_device *sysdev) { int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) - set_mtrr(i, - mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } } return 0; } @@ -615,26 +631,29 @@ int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * - * This needs to be called early; before any of the other CPUs are + * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). 
- * + * */ void __init mtrr_bp_init(void) { u32 phys_addr; + init_ifs(); phys_addr = 32; if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; phys_addr = 36; - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it to when they extend the address + * bus of the Xeon. + */ if (cpuid_eax(0x80000000) >= 0x80000008) { phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ @@ -649,9 +668,11 @@ void __init mtrr_bp_init(void) size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; phys_addr = 32; } @@ -694,7 +715,6 @@ void __init mtrr_bp_init(void) changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); } - } } } @@ -706,12 +726,17 @@ void mtrr_ap_init(void) if (!mtrr_if || !use_intel()) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes */ local_irq_save(flags); @@ -732,19 +757,23 @@ static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; + if (use_intel()) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); - } else { - /* The CPUs haven't MTRR and seem to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * TBD: is there any system with such CPU which supports - * suspend/resume? if no, we should remove the code. - */ - sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + return 0; } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + return 0; } subsys_initcall(mtrr_init_finialize); -- cgit v1.2.3 From e3d0e69268dffb9676bf0800a60fb3573a723480 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Jul 2009 09:44:11 +0200 Subject: x86: Further clean up of mtrr/generic.c Yinghai noticed that i defined BIOS_BUG_MSG but added no usage for it. 
The usage is to clean up this turd in generic.c: printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " "contains strange UC entry under 1M, check " "with your system vendor!\n", i); Breaking printk lines in the middle looks ugly, is hard to read and breaks 'git grep'. Use the BIOS_BUG_MSG instead. Also complete the moving of structure definitions and variables to the top of the file. Reported-by: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index b8aba811b60e..315738c74aad 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -34,14 +34,37 @@ #include "mtrr.h" -/* Should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 - struct res_range { unsigned long start; unsigned long end; }; +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + static int __init add_range(struct res_range *range, int nr_range, unsigned long start, unsigned long end) @@ -147,18 +170,6 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; - -static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -200,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && (mtrr_state.enabled & 1)) { /* Var MTRR contains UC entry below 1M? 
Skip it: */ - printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " - "contains strange UC entry under 1M, check " - "with your system vendor!\n", i); + printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -244,9 +253,6 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, return nr_range; } -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - #ifdef CONFIG_MTRR_SANITIZER static unsigned long __init sum_ranges(struct res_range *range, int nr_range) @@ -284,14 +290,6 @@ static int __init mtrr_cleanup_debug_setup(char *str) } early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, unsigned char type, unsigned int address_bits) -- cgit v1.2.3 From 96ccd4a43a4d80c80be636cd025a69959cf47424 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2009 12:47:52 +0200 Subject: genirq: Remove obsolete defines and typedefs The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) have been kept around for migration reasons. The last users are gone, remove them. Signed-off-by: Thomas Gleixner --- Documentation/feature-removal-schedule.txt | 9 --------- include/linux/irq.h | 7 ------- 2 files changed, 16 deletions(-) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f8cd450be9aa..2af78126b053 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -394,15 +394,6 @@ Who: Thomas Gleixner ----------------------------- -What: obsolete generic irq defines and typedefs -When: 2.6.30 -Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) - have been kept around for migration reasons. After more than two years - it's time to remove them finally -Who: Thomas Gleixner - ---------------------------- - What: fakephp and associated sysfs files in /sys/bus/pci/slots/ When: 2011 Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to diff --git a/include/linux/irq.h b/include/linux/irq.h index cb2e77a3f7f7..6956df9961ab 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -219,13 +219,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); -/* - * Migration helpers for obsolete names, they will go away: - */ -#define hw_interrupt_type irq_chip -#define no_irq_type no_irq_chip -typedef struct irq_desc irq_desc_t; - /* * Pick up the arch-dependent methods: */ -- cgit v1.2.3 From c71320d0c445e01555a86fa6523609db47eeaef6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 5 Jul 2009 00:22:34 +0200 Subject: genirq: Fix comment describing suspend_device_irqs() The kerneldoc comment describing suspend_device_irqs() is currently misleading, because generally the function doesn't really disable interrupt lines at the chip level. Replace it with a more accurate one. Signed-off-by: Rafael J. 
Wysocki LKML-Reference: <200907050022.35117.rjw@sisk.pl> Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 638d8bedec14..a0bb09e79867 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -15,10 +15,10 @@ /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device interrupts need to be - * disabled at the chip level and this function is provided for this purpose. - * It disables all interrupt lines that are enabled at the moment and sets the - * IRQ_SUSPENDED flag for them. + * During system-wide suspend or hibernation device drivers need to be prevented + * from receiving interrupts and this function is provided for this purpose. + * It marks all interrupt lines in use, except for the timer ones, as disabled + * and sets the IRQ_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { -- cgit v1.2.3 From 951ed4d36b77ba9fe1ea08fc3c59d8bb6c9bda32 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 7 Jul 2009 11:27:28 +0200 Subject: timekeeping: optimized ktime_get[_ts] for GENERIC_TIME=y The generic ktime_get function defined in kernel/hrtimer.c is suboptimal for GENERIC_TIME=y: 0) | ktime_get() { 0) | ktime_get_ts() { 0) | getnstimeofday() { 0) | read_tod_clock() { 0) 0.601 us | } 0) 1.938 us | } 0) | set_normalized_timespec() { 0) 0.602 us | } 0) 4.375 us | } 0) 5.523 us | } Overall there are two read_seqbegin/read_seqretry loops and a lot of unnecessary struct timespec calculations. ktime_get returns a nanosecond value which is the sum of xtime, wall_to_monotonic and the nanosecond delta from the clock source. ktime_get can be optimized for GENERIC_TIME=y. The new version only calls clocksource_read: 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.610 us | } 0) 1.977 us | } It uses a single read_seqbegin/read_seqretry loop and just adds everything to a nanosecond value. ktime_get_ts is optimized in a similar fashion.
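A minimal sketch, not part of the original changelog, of the kind of caller that benefits; ktime_get(), ktime_sub() and ktime_to_ns() are the kernel's own helpers from linux/ktime.h, while time_it() and do_work() are made-up names:

	static void time_it(void)
	{
		ktime_t t0, t1;
		s64 delta_ns;

		t0 = ktime_get();	/* now one seqlock loop, one clocksource read */
		do_work();		/* hypothetical workload being timed */
		t1 = ktime_get();
		delta_ns = ktime_to_ns(ktime_sub(t1, t0));
		pr_debug("do_work() took %lld ns\n", (long long)delta_ns);
	}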
[ tglx: added WARN_ON(timekeeping_suspended) as in getnstimeofday() ] Signed-off-by: Martin Schwidefsky Acked-by: john stultz LKML-Reference: <20090707112728.3005244d@skybase> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 4 +++ kernel/time/timekeeping.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958a96e7..829e0664b72e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,7 @@ #include +#ifndef CONFIG_GENERIC_TIME /** * ktime_get - get the monotonic time in ktime_t format * @@ -62,6 +63,7 @@ ktime_t ktime_get(void) return timespec_to_ktime(now); } EXPORT_SYMBOL_GPL(ktime_get); +#endif /** * ktime_get_real - get the real (wall-) time in ktime_t format @@ -106,6 +108,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +#ifndef CONFIG_GENERIC_TIME /** * ktime_get_ts - get the monotonic clock in timespec format * @ts: pointer to timespec variable @@ -130,6 +133,7 @@ void ktime_get_ts(struct timespec *ts) ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* * Get the coarse grained time at the softirq based on xtime and diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9c633a..7a248135c6f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -125,6 +125,75 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + cycle_t cycle_now, cycle_delta; + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs += cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. 
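+ *
+ * (Sketch, not part of the original kernel-doc: a typical caller does
+ *
+ *	struct timespec ts;
+ *
+ *	ktime_get_ts(&ts);
+ *
+ * and then reads the monotonic time from ts.tv_sec and ts.tv_nsec.)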
+ */ +void ktime_get_ts(struct timespec *ts) +{ + cycle_t cycle_now, cycle_delta; + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.3 From a40f262cc21fbfd781bbddcc40b16b83a75f5f34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2009 13:00:31 +0200 Subject: timekeeping: Move ktime_get() functions to timekeeping.c The ktime_get() functions for GENERIC_TIME=n are still located in hrtimer.c. Move them to time/timekeeping.c where they belong. LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 60 ----------------------------------------------- kernel/time/timekeeping.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 829e0664b72e..43d151f185b6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,39 +48,6 @@ #include -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); -#endif - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); - /* * The timer bases: * @@ -108,33 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); -#endif - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. 
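A sketch of the layout the move produces in timekeeping.c, paraphrased from the hunk below rather than quoted from it:

	#ifdef CONFIG_GENERIC_TIME
	/* the clocksource-based ktime_get()/ktime_get_ts() added by the
	 * previous patch stay in this branch */
	#else /* GENERIC_TIME */
	/* the simple versions moved out of hrtimer.c: they build the
	 * result from getnstimeofday() and wall_to_monotonic */
	#endif /* !GENERIC_TIME */

	/* ktime_get_real() lands after the #ifdef, common to both */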
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7a248135c6f2..02c0b2c9c674 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -290,10 +290,65 @@ static void change_clocksource(void) clock->name); */ } -#else +#else /* GENERIC_TIME */ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -#endif + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec -- cgit v1.2.3 From 3adc54fa82a68be1cd1ac82ad786ee362796e50a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 15:32:01 -0400 Subject: ring-buffer: make the buffer a true circular link list This patch changes the ring buffer data pages from using a link list head pointer, to making each buffer page point to another buffer page and never back to a "head". This makes the handling of the ring buffer less complex, since the traversing of the ring buffer pages no longer needs to account for the head pointer. This change also is needed to make the ring buffer lockless. [ Changes in version 2: - Added change that Lai Jiangshan mentioned. From: Lai Jiangshan Date: Thu, 11 Jun 2009 11:25:48 +0800 LKML-Reference: <4A30793C.6090208@cn.fujitsu.com> I'm not sure whether these 4 lines: bpage = list_entry(pages.next, struct buffer_page, list); list_del_init(&bpage->list); cpu_buffer->pages = &bpage->list; list_splice(&pages, cpu_buffer->pages); equal to these 2 lines: cpu_buffer->pages = pages.next; list_del(&pages); If there are equivalent, I think the second one are simpler. It may be not a really necessarily cleanup. 
What I asked is: if there are equivalent, could you use these two line: cpu_buffer->pages = pages.next; list_del(&pages); ] [ Impact: simplify the ring buffer to help make it lockless ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..7c0168ad6d51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -406,7 +406,7 @@ struct ring_buffer_per_cpu { spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -498,7 +498,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -521,12 +521,13 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); @@ -541,7 +542,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. 
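+ *
+ * (Sketch, not part of the original patch: with three pages A, B and
+ *  C the resulting shape is A -> B -> C -> A, and cpu_buffer->pages
+ *  simply points at one of the pages rather than at a separate head.)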
+ */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -573,7 +580,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -594,7 +600,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; return cpu_buffer; @@ -609,15 +615,20 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -760,14 +771,14 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) synchronize_sched(); for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; - p = cpu_buffer->pages.next; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; rb_reset_cpu(cpu_buffer); @@ -795,7 +806,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); @@ -992,9 +1003,6 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, { struct list_head *p = (*bpage)->list.next; - if (p == &cpu_buffer->pages) - p = p->next; - *bpage = list_entry(p, struct buffer_page, list); } @@ -2247,6 +2255,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidently swap it. 
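+ *
+ * (Sketch, not part of the original patch: after the swap the old
+ *  reader page is linked into the ring again, so any page on the
+ *  ring, here reader->list.prev, is a safe place to park
+ *  cpu_buffer->pages.)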
+ */ + cpu_buffer->pages = reader->list.prev; + local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -2719,7 +2734,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); -- cgit v1.2.3 From 77ae365eca895061c8bf2b2e3ae1d9ea62869739 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Mar 2009 11:00:29 -0400 Subject: ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the least significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is described in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. 
- Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 1 - kernel/trace/ring_buffer.c | 886 ++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.c | 3 - 3 files changed, 738 insertions(+), 152 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 29f8599e6bea..7fca71693ae7 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -170,7 +170,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c0168ad6d51..e648ba4f70e0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -322,6 +322,14 @@ struct buffer_data_page { unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ @@ -330,6 +338,21 @@ struct buffer_page { struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. 
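+ *
+ * (Worked example, not part of the original patch: with two nested
+ *  updaters in flight, write == 2 * RB_WRITE_INTCNT + index, so the
+ *  true index is write & RB_WRITE_MASK and the updater count is
+ *  write >> 20.)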
+ */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -403,7 +426,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + spinlock_t reader_lock; /* serialize readers */ raw_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -411,13 +434,12 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. 
But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the give page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. 
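(Added note, not in the original comment: clearing means stripping the RB_FLAG_MASK bits from each ->next pointer, leaving a plain list that can be walked or freed normally.)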
*/ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = (unsigned long)cmpxchg(&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, &new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. 
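+ *
+ * (Sketch, not part of the original patch: the local_add_return()
+ *  calls below bump only the updater half of the word, bits 20 and
+ *  up, so the low RB_WRITE_MASK bits still hold the index that an
+ *  interrupt may be advancing concurrently.)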
+ */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + */ + local_cmpxchg(&next_page->write, old_write, val); + local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -501,11 +902,16 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,8 +919,12 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } @@ -533,6 +943,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -586,6 +999,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -603,6 +1018,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -620,6 +1037,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 
free_buffer_page(cpu_buffer->reader_page); + rb_head_page_deactivate(cpu_buffer); + if (head) { list_for_each_entry_safe(bpage, tmp, head, list) { list_del_init(&bpage->list); @@ -770,6 +1189,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) return; @@ -800,6 +1221,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); + for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) return; @@ -809,6 +1233,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -958,22 +1383,15 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -981,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -993,19 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1031,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1040,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. 
*/ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1051,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1085,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1129,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event, } } +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupted the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before,
+ * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved past next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1207,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet?
*/ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + next_page->page->time_stamp = *ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1305,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } @@ -1324,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ @@ -1368,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page.
That is fine * because we just shorten what is on this page. */ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1882,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1975,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; return ret; @@ -1996,32 +2581,12 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); -/** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - /** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer @@ -2037,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2060,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + local_read(&cpu_buffer->overrun)) - cpu_buffer->read; } return entries; @@ -2083,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2096,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2214,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) struct buffer_page *reader = NULL; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); 
__raw_spin_lock(&cpu_buffer->lock); @@ -2247,11 +2815,17 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); - reader = cpu_buffer->head_page; + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); cpu_buffer->reader_page->list.next = reader->list.next; cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -2262,22 +2836,35 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ cpu_buffer->pages = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); - local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want, + * but if the writer is in the process of moving it + * then it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * If we did not convert it, then we must try again. */ - cpu_buffer->head_page = cpu_buffer->reader_page; + if (!ret) + goto spin; - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page.
+ */ + reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; @@ -2733,6 +3320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); @@ -2750,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -3107,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb3afc8b306..b591f7a1bd7b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3630,9 +3630,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); - trace_seq_printf(s, "nmi dropped: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3 From 8b2c70d1e43074cc06afe99b0de12b686d9c9d02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Jun 2009 15:45:41 -0400 Subject: ring-buffer: add design document This adds the design document for the ring buffer and also explains how it is designed to have lockless writes. Signed-off-by: Steven Rostedt --- Documentation/trace/ring-buffer-design.txt | 955 +++++++++++++++++++++++++++++ 1 file changed, 955 insertions(+) create mode 100644 Documentation/trace/ring-buffer-design.txt diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 000000000000..5b1d23d604c5 --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt @@ -0,0 +1,955 @@ + Lockless Ring Buffer Design + =========================== + +Copyright 2009 Red Hat Inc. + Author: Steven Rostedt + License: The GNU Free Documentation License, Version 1.2 + (dual licensed under the GPL v2) +Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, + and Frederic Weisbecker. + + +Written for: 2.6.31 + +Terminology used in this Document +--------------------------------- + +tail - where new writes happen in the ring buffer. + +head - where new reads happen in the ring buffer. + +producer - the task that writes into the ring buffer (same as writer) + +writer - same as producer + +consumer - the task that reads from the buffer (same as reader) + +reader - same as consumer. + +reader_page - A page outside the ring buffer used solely (for the most part) + by the reader. 
+ +head_page - a pointer to the page that the reader will use next + +tail_page - a pointer to the page that will be written to next + +commit_page - a pointer to the page with the last finished non-nested write. + +cmpxchg - hardware-assisted atomic transaction that performs the following: + + A = B iff previous A == C + + R = cmpxchg(A, C, B) is saying that we replace A with B if and only if + current A is equal to C, and we put the old (current) A into R + + R gets the previous A regardless of whether A is updated with B or not. + + To see if the update was successful a compare of R == C may be used. + +The Generic Ring Buffer +----------------------- + +The ring buffer can be used in either an overwrite mode or in +producer/consumer mode. + +Producer/consumer mode is where, if the producer were to fill up the +buffer before the consumer could free up anything, the producer +will stop writing to the buffer. This will lose the most recent events. + +Overwrite mode is where, if the producer were to fill up the buffer +before the consumer could free up anything, the producer will +overwrite the older data. This will lose the oldest events. + +No two writers can write at the same time (on the same per cpu buffer); +a writer may interrupt another writer, but it must finish writing +before the previous writer may continue. This is very important to the +algorithm. The writers act like a "stack". The way interrupts work +enforces this behavior. + + + writer1 start + writer2 start + writer3 start + writer3 finishes + writer2 finishes + writer1 finishes + +This is very much like a writer being preempted by an interrupt and +the interrupt doing a write as well. + +Readers can happen at any time. But no two readers may run at the +same time, nor can a reader preempt/interrupt another reader. A reader +can not preempt/interrupt a writer, but it may read/consume from the +buffer at the same time as a writer is writing; the reader must be +on another processor to do so. A reader may read on its own processor +and can be preempted by a writer. + +A writer can preempt a reader, but a reader can not preempt a writer. +But a reader can read the buffer at the same time (on another processor) +as a writer. + +The ring buffer is made up of a list of pages held together by a linked list. + +At initialization a reader page is allocated for the reader that is not +part of the ring buffer. + +The head_page, tail_page and commit_page are all initialized to point +to the same page. + +The reader page is initialized to have its next pointer pointing to +the head page, and its previous pointer pointing to a page before +the head page. + +The reader has its own page to use. At start up time, this page is +allocated but is not attached to the list. When the reader wants +to read from the buffer, if its page is empty (like it is on start up) +it will swap its page with the head_page. The old reader page will +become part of the ring buffer and the head_page will be removed. +The page after the inserted page (old reader_page) will become the +new head page. + +Once the new page is given to the reader, the reader could do what +it wants with it, as long as a writer has left that page. + +A sample of how the reader page is swapped: note this does not +show the head page in the buffer; it is for demonstrating a swap +only.
+ + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |-->| |-->| | + | |<--| |<--| | + +---+ +---+ +---+ + ^ | ^ | + | +-------------+ | + +-----------------+ + + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ v + | +---+ +---+ +---+ + | | |-->| |-->| | + | | |<--| |<--| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +-------------+ | | + | +-----------------+ | + +------------------------------------+ + + +------+ + |reader| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------------------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + + + +It is possible that the page swapped is the commit page and the tail page, +if what is in the ring buffer is less than what is held in a buffer page. + + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +This case is still valid for this algorithm. +When the writer leaves the page, it simply goes into the ring buffer +since the reader page still points to the next location in the ring +buffer. + + +The main pointers: + + reader page - The page used solely by the reader and is not part + of the ring buffer (may be swapped in) + + head page - the next page in the ring buffer that will be swapped + with the reader page. + + tail page - the page where the next write will take place. + + commit page - the page that last finished a write. + +The commit page is only updated by the outer most writer in the +writer stack. A writer that preempts another writer will not move the +commit page. + +When data is written into the ring buffer, a position is reserved +in the ring buffer and passed back to the writer. When the writer +is finished writing data into that position, it commits the write. + +Another write (or a read) may take place at any time during this +transaction. If another write happens it must finish before continuing +with the previous write.
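+ +For a writer, the reserve/commit discipline described above looks roughly +like the following sketch. It uses the ring buffer's public API (error +handling and time stamps omitted); the "Write reserve" and "Write commit" +diagrams below picture the same two steps: + + struct ring_buffer_event *event; + void *data; + + event = ring_buffer_lock_reserve(buffer, length); + if (event) { + data = ring_buffer_event_data(event); + /* fill data in; a nested writer may preempt here */ + ring_buffer_unlock_commit(buffer, event); + } +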
+ + + Write reserve: + + Buffer page + +---------+ + |written | + +---------+ <--- given back to writer (current commit) + |reserved | + +---------+ <--- tail pointer + | empty | + +---------+ + + Write commit: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ <--- next position for write (current commit) + | empty | + +---------+ + + + If a write happens after the first reserve: + + Buffer page + +---------+ + |written | + +---------+ <-- current commit + |reserved | + +---------+ <--- given back to second writer + |reserved | + +---------+ <--- tail pointer + + After second writer commits: + + + Buffer page + +---------+ + |written | + +---------+ <--(last full commit) + |reserved | + +---------+ + |pending | + |commit | + +---------+ <--- tail pointer + + When the first writer commits: + + Buffer page + +---------+ + |written | + +---------+ + |written | + +---------+ + |written | + +---------+ <--(last full commit and tail pointer) + + +The commit pointer points to the last write location that was +committed without preempting another write. When a write that +preempted another write is committed, it only becomes a pending commit +and will not be a full commit till all writes have been committed. + +The commit page points to the page that has the last full commit. +The tail page points to the page with the last write (before +committing). + +The tail page is always equal to or after the commit page. It may +be several pages ahead. If the tail page catches up to the commit +page then no more writes may take place (regardless of the mode +of the ring buffer: overwrite or producer/consumer). + +The order of the pages is: + + head page + commit page + tail page + +Possible scenario: + tail page + head page commit page | + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +There is a special case where the head page is after the commit page, +and possibly the tail page as well. That is when the commit (and tail) page has been +swapped with the reader page. This is because the head page is always +part of the ring buffer, but the reader page is not. Whenever there +has been less than a full page that has been committed inside the ring buffer, +and a reader swaps out a page, it will be swapping out the commit page. + + + reader page commit page tail page + | | | + v | | + +---+ | | + | |<----------+ | + | |<------------------------+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + +In this case, the head page will not move when the tail and commit +move back into the ring buffer. + +The reader can not swap a page into the ring buffer if the commit page +is still on that page. If the read meets the last commit (real commit +not pending or reserved), then there is nothing more to read. +The buffer is considered empty until another full commit finishes. + +When the tail meets the head page, if the buffer is in overwrite mode, +the head page will be pushed ahead one. If the buffer is in producer/consumer +mode, the write will fail.
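+ +In rough pseudo-code (a sketch of the decision only, not the exact +implementation), the writer's choice when the tail page catches up to +the head page is: + + if (next_page == head_page) { + if (!overwrite_mode) + fail the write /* producer/consumer: lose this event */ + else + push head_page forward /* overwrite: lose the oldest events */ + } + move tail_page forward +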
+ +Overwrite mode: + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + head page + +Note, the reader page will still point to the previous head page. +But when a swap takes place, it will use the most recent head page. + + +Making the Ring Buffer Lockless: +-------------------------------- + +The main idea behind the lockless algorithm is to combine the moving +of the head_page pointer with the swapping of pages with the reader. +State flags are placed inside the pointer to the page. To do this, +each page must be aligned in memory by 4 bytes. This will allow the 2 +least significant bits of the address to be used as flags, since +they will always be zero for the address. To get the address, +simply mask out the flags. + + MASK = ~3 + + address & MASK + +Two flags will be kept by these two bits: + + HEADER - the page being pointed to is a head page + + UPDATE - the page being pointed to is being updated by a writer + and was or is about to be a head page. + + + reader page + | + v + +---+ + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above pointer "-H->" would have the HEADER flag set. That is, +the next page is the next page to be swapped out by the reader. +This pointer means the next page is the head page. + +When the tail page meets the head pointer, it will use cmpxchg to +change the pointer to the UPDATE state: + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +"-U->" represents a pointer in the UPDATE state. + +Any access to the reader will need to take some sort of lock to serialize +the readers. But the writers will never take a lock to write to the +ring buffer. This means we only need to worry about a single reader, +and writes only preempt in "stack" formation. + +When the reader tries to swap the page with the ring buffer, it +will also use cmpxchg. If the flag bit in the pointer to the +head page does not have the HEADER flag set, the compare will fail +and the reader will need to look for the new head page and try again. +Note, the flags UPDATE and HEADER are never set at the same time. + +The reader swaps the reader page as follows: + + +------+ + |reader| RING BUFFER + |page | + +------+ + +---+ +---+ +---+ + | |--->| |--->| | + | |<---| |<---| | + +---+ +---+ +---+ + ^ | ^ | + | +---------------+ | + +-----H-------------+ + +The reader sets the reader page next pointer as HEADER to the page after +the head page. + + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | +---+ +---+ +---+ + | | |--->| |--->| | + | | |<---| |<---| |<-+ + | +---+ +---+ +---+ | + | ^ | ^ | | + | | +---------------+ | | + | +-----H-------------+ | + +--------------------------------------+ + +It does a cmpxchg with the pointer to the previous head page to make it +point to the reader page.
Note that the new pointer does not have the HEADER +flag set. This action atomically moves the head page forward. + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | |<--| |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + +After the new head page is set, the previous pointer of the head page is +updated to the reader page. + + +------+ + |reader| RING BUFFER + |page |-------H-----------+ + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | |-->| |-->| | + | | | | | |<--| |<-+ + | | +---+ +---+ +---+ | + | | | ^ | | + | | +-------------+ | | + | +-----------------------------+ | + +------------------------------------+ + + +------+ + |buffer| RING BUFFER + |page |-------H-----------+ <--- New head page + +------+ <---------------+ v + | ^ +---+ +---+ +---+ + | | | | | |-->| | + | | New | | | |<--| |<-+ + | | Reader +---+ +---+ +---+ | + | | page ----^ | | + | | | | + | +-----------------------------+ | + +------------------------------------+ + +Another important point. The page that the reader page points back to +by its previous pointer (the one that now points to the new head page) +never points back to the reader page. That is because the reader page is +not part of the ring buffer. Traversing the ring buffer via the next pointers +will always stay in the ring buffer. Traversing the ring buffer via the +prev pointers may not. + +Note, the way to determine a reader page is simply by examining the previous +pointer of the page. If the next pointer of the previous page does not +point back to the original page, then the original page is a reader page: + + + +--------+ + | reader | next +----+ + | page |-------->| |<====== (buffer page) + +--------+ +----+ + | | ^ + | v | next + prev | +----+ + +------------->| | + +----+ + +The way the head page moves forward: + +When the tail page meets the head page and the buffer is in overwrite mode +and more writes take place, the head page must be moved forward before the +writer may move the tail page. The way this is done is that the writer +performs a cmpxchg to convert the pointer to the head page from the HEADER +flag to have the UPDATE flag set. Once this is done, the reader will +not be able to swap the head page from the buffer, nor will it be able to +move the head page, until the writer is finished with the move. + +This eliminates any races that the reader can have on the writer. The reader +must spin, and this is why the reader can not preempt the writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The following page will be made into the new head page. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the new head page has been set, we can set the old head page +pointer back to NORMAL. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the head page has been moved, the tail page may now move forward. 
+ + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The above are the trivial updates. Now for the more complex scenarios. + + +As stated before, if enough writes preempt the first write, the +tail page may make it all the way around the buffer and meet the commit +page. At this time, we must start dropping writes (usually with some kind +of warning to the user). But what happens if the commit was still on the +reader page? The commit page is not part of the ring buffer. The tail page +must account for this. + + + reader page commit page + | | + v | + +---+ | + | |<----------+ + | | + | |------+ + +---+ | + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + ^ + | + tail page + +If the tail page were to simply push the head page forward, the commit when +leaving the reader page would not be pointing to the correct page. + +The solution to this is to test if the commit page is on the reader page +before pushing the head page. If it is, then it can be assumed that the +tail page wrapped the buffer, and we must drop new writes. + +This is not a race condition, because the commit page can only be moved +by the outer most writer (the writer that was preempted). +This means that the commit will not move while a writer is moving the +tail page. The reader can not swap the reader page if it is also being +used as the commit page. The reader can simply check that the commit +is off the reader page. Once the commit page leaves the reader page +it will never go back on it unless a reader does another swap with the +buffer page that is also the commit page. + + +Nested writes +------------- + +In the pushing forward of the tail page we must first push forward +the head page if the head page is the next page. If the head page +is not the next page, the tail page is simply updated with a cmpxchg. + +Only writers move the tail page. This must be done atomically to protect +against nested writers. + + temp_page = tail_page + next_page = temp_page->next + cmpxchg(tail_page, temp_page, next_page) + +The above will update the tail page if it is still pointing to the expected +page. If this fails, a nested write pushed it forward, and the current write +does not need to push it. + + + temp page + | + v + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Nested write comes in and moves the tail page forward: + + tail page (moved by nested writer) + temp page | + | | + v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The above would fail the cmpxchg, but since the tail page has already +been moved forward, the writer will just try again to reserve storage +on the new tail page. + +But the moving of the head page is a bit more complex. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The writer converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But if a nested writer preempts here, it will see that the next +page is a head page, but it is also nested. It will detect that +it is nested and will save that information.
The detection is the +fact that it sees the UPDATE flag instead of a HEADER or NORMAL +pointer. + +The nested writer will set the new head page pointer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But it will not reset the update back to normal. Only the writer +that converted a pointer from HEAD to UPDATE will convert it back +to NORMAL. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +After the nested writer finishes, the outer most writer will convert +the UPDATE pointer to NORMAL. + + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +It can be even more complex if several nested writes came in and moved +the tail page ahead several pages: + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-H->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The write converts the head page pointer to UPDATE. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Next writer comes in, and sees the update and sets up the new +head page. + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The nested writer moves the tail page forward. But does not set the old +update page to NORMAL because it is not the outer most writer. + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Another writer preempts and sees the page after the tail page is a head page. +It changes it from HEAD to UPDATE. + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |---> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The writer will move the head page forward: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-U->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +But now that the third writer did change the HEAD flag to UPDATE it +will convert it to normal: + + +(third writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +Then it will move the tail page, and return back to the second writer. + + +(second writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + + +The second writer will fail to move the tail page because it was already +moved, so it will try again and add its data to the new tail page. +It will return to the first writer. + + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +The first writer can not know atomically test if the tail page moved +while it updates the HEAD page. It will then update the head page to +what it thinks is the new head page. 
+ + +(first writer) + + tail page + | + v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Since the cmpxchg returns the old value of the pointer, the first writer +will see it succeeded in updating the pointer from NORMAL to HEAD. +But as we can see, this is not good enough. It must also check to see +if the tail page is either where it used to be or on the next page: + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |-H->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +If tail page != A and tail page does not equal B, then it must reset the +pointer back to NORMAL. Since it only needs to worry about +nested writers, it only needs to check this after setting the HEAD page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |-U->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + +Now the writer can update the head page. This is also why the head page must +remain in UPDATE and only be reset by the outer most writer. This prevents +the reader from seeing the incorrect head page. + + +(first writer) + + A B tail page + | | | + v v v + +---+ +---+ +---+ +---+ +<---| |--->| |--->| |--->| |-H-> +--->| |<---| |<---| |<---| |<--- + +---+ +---+ +---+ +---+ + -- cgit v1.2.3 From c31d96338a6041520ba5f1b6a4a5012ef00686b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:37 +0200 Subject: x86: mce: Make CONFIG_X86_ANCIENT_MCE dependent on CONFIG_X86_MCE Add a missing dependency for ANCIENT_MCE. It didn't matter in practice because the ANCIENT code wasn't compiled without X86_MCE, but it's better to express that clearly in Kconfig. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2fb..5962b872a7ad 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -823,7 +823,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip -- cgit v1.2.3 From bab9bc6583fe6c1660d6ed36dd14bbb4edfaf393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:38 +0200 Subject: x86: mce: Update X86_MCE description in x86/Kconfig - Clarify that this config controls thermal throttling reporting too - Clarify the types of errors reported by machine checks - Drop references to ancient CPUs. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5962b872a7ad..134a8c0d80dd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -774,20 +774,12 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption).
The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. + ranging from warning messages to halting the machine. config X86_OLD_MCE depends on X86_32 && X86_MCE -- cgit v1.2.3 From 5bb38adcb54cf7192b154368ad62982caa11ca0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:39 +0200 Subject: x86: mce: Remove old i386 machine check code As announced in feature-remove-schedule.txt remove CONFIG_X86_OLD_MCE This patch only removes code. The ancient machine check code for very old systems that are not supported by CONFIG_X86_NEW_MCE is still kept. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 -- arch/x86/Kconfig | 35 +------ arch/x86/include/asm/mce.h | 11 -- arch/x86/kernel/cpu/mcheck/Makefile | 2 - arch/x86/kernel/cpu/mcheck/k7.c | 116 -------------------- arch/x86/kernel/cpu/mcheck/mce.c | 47 --------- arch/x86/kernel/cpu/mcheck/non-fatal.c | 94 ----------------- arch/x86/kernel/cpu/mcheck/p4.c | 163 ----------------------------- arch/x86/kernel/cpu/mcheck/p6.c | 127 ---------------------- 9 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/k7.c delete mode 100644 arch/x86/kernel/cpu/mcheck/non-fatal.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p4.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p6.c diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7129846a2785..edb2f0b07616 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -444,13 +444,3 @@ What: CONFIG_RFKILL_INPUT When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg - ----------------------------- - -What: CONFIG_X86_OLD_MCE -When: 2.6.32 -Why: Remove the old legacy 32bit machine check code. This has been - superseded by the newer machine check code from the 64bit port, - but the old version has been kept around for easier testing. Note this - doesn't impact the old P5 and WinChip machine check handlers. -Who: Andi Kleen diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 134a8c0d80dd..d986769a7d90 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,21 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. 
- config X86_NEW_MCE depends on X86_MCE bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + default y config X86_MCE_INTEL def_bool y @@ -835,29 +824,9 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b50b9e9042c4..6b8a974e1270 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -208,11 +201,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2b..022a036ce21b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,9 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc609..000000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - 
if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7da8fec9ca88..5ff6362ecb18 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,8 +58,6 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -1993,51 +1991,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. 
Only for compatibility. */ diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb6..000000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2e..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include -#include -#include -#include - -#include -#include -#include - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - 
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? 
*/ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f8178183..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? 
*/ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} -- cgit v1.2.3 From c1ebf835617035b1f08f734247dcb981e17aac6b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:41 +0200 Subject: x86: mce: Rename CONFIG_X86_NEW_MCE to CONFIG_X86_MCE Drop the CONFIG_X86_NEW_MCE symbol and change all references to it to check for CONFIG_X86_MCE directly. No code changes Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 11 +++-------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 +-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d986769a7d90..06880ca677f4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,15 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_NEW_MCE - depends on X86_MCE - bool - default y - config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -797,7 +792,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -817,7 +812,7 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. 
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa07851..5e3f2044f0d3 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63a..f4227289caf7 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 022a036ce21b..4ac6d48fe11b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,5 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f5..74656d1d4e30 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2d..8a194ad357ed 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94e..cc26ad4c3070 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); -- cgit v1.2.3 From 9eda8cb3ac235217e4ffa01cb9cedee1c1550599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:42 +0200 Subject: x86: mce: Move code in mce.c Now that the X86_OLD_MCE ifdefs are gone move some code that used to be outside the big ifdef to a more natural place near its user. No code change. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5ff6362ecb18..e16271f01ac4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,17 +45,6 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -1322,6 +1311,17 @@ static void mce_init_timer(void) add_timer(t); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: -- cgit v1.2.3 From cebe182033f156b430952370fb0f9dbe6e89b081 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:43 +0200 Subject: x86: mce: Move per bank data in a single datastructure This addresses one of the leftover review comments. Move the per bank data into a single structure. This avoids several separate variables and also separate allocation of sysfs objects. I didn't move the CMCI ownership information so far because that would have needed some non trivial changes in the algorithms. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 109 +++++++++++++++--------------- 2 files changed, 67 insertions(+), 56 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e5..6bd51e7ba87b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include #include enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? 
*/ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e16271f01ac4..a04806e01a82 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -74,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -91,11 +90,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -482,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -903,7 +897,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; @@ -1146,6 +1140,21 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ @@ -1169,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. 
*/ @@ -1205,9 +1213,10 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -1223,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1237,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1250,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1578,7 +1587,8 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } return 0; @@ -1654,14 +1664,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1672,7 +1683,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1827,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1836,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1857,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,7 +1874,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } } 
@@ -1879,8 +1891,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); } } @@ -1928,35 +1941,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1968,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) -- cgit v1.2.3 From a2d32bcbc008aa0f9c301a7c6f3494cb23e6af54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:44 +0200 Subject: x86: mce: macros to compute banks MSRs Instead of open coded calculations for bank MSRs hide the indexing of higher banks MCE register MSRs in new macros. No semantic changes. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce.c | 34 +++++++++++++++++----------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 +++++----- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e3..3d1ce094586a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a04806e01a82..07139a0578e3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -267,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -485,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -500,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -518,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. 
*/ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -539,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -823,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -904,7 +904,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -945,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1216,8 +1216,8 @@ static void mce_init(void) struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -1589,7 +1589,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1876,7 +1876,7 @@ static void mce_disable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1893,7 +1893,7 @@ static void mce_reenable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a32..889f665fe93d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? 
-- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); -- cgit v1.2.3 From 3ccdccfadbd2548abe38682b587f4ba27eac2fc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:45 +0200 Subject: x86: mce: Lower maximum number of banks to architecture limit The Intel x86 architecture right now only supports 32 machine check banks; more would bump into other MSRs. So lower the max define to 32. This only affects a few bitmaps; most data structures are dynamically sized anyway. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6b8a974e1270..ad7535372918 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -130,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; -- cgit v1.2.3 From c5cb183608167c744cb28bbd85884be5a4ce875d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:20:12 +0800 Subject: tracing/filter: Remove preds from struct event_subsystem No need to save preds to event_subsystem, because it's not used.
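[Editor's sketch, not part of the commit: the resulting flow in replace_preds(), assembled from the hunks below. After the change, both add paths copy what they need from the predicate, so it can be freed unconditionally:]

	pred = create_pred(elt->op, operand1, operand2);
	if (call)
		err = filter_add_pred(ps, call, pred);
	else
		err = filter_add_subsystem_pred(ps, system, pred,
						filter_string);
	filter_free_pred(pred);		/* neither path keeps the pointer */
	if (err)
		return err;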
Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A83C.1030005@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 39 ++++++-------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf46..b9aae72d13db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,17 +420,7 @@ EXPORT_SYMBOL_GPL(init_preds); static void filter_free_subsystem_preds(struct event_subsystem *system) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; - int i; - - if (filter->n_preds) { - for (i = 0; i < filter->n_preds; i++) - filter_free_pred(filter->preds[i]); - kfree(filter->preds); - filter->preds = NULL; - filter->n_preds = 0; - } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -607,26 +597,9 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct filter_pred *pred, char *filter_string) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; int err = 0; - if (!filter->preds) { - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - } - - if (filter->n_preds == MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) @@ -1029,12 +1002,12 @@ static int replace_preds(struct event_subsystem *system, if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; @@ -1048,12 +1021,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); - if (call) { + if (call) err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else + else err = filter_add_subsystem_pred(ps, system, pred, filter_string); + filter_free_pred(pred); if (err) return err; -- cgit v1.2.3 From dc82ec98a4727fd51b77e92d05fe7d2db3dcc11c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 9 Jul 2009 16:22:22 +0800 Subject: tracing/filter: Remove empty subsystem and its directory Remove empty subsystem and its directory when module unload. 
Before patch:
# rmmod trace-events-sample.ko
# ls sample
enable  filter

After patch:
# rmmod trace-events-sample.ko
# ls sample
ls: cannot access sample: No such file or directory

Signed-off-by: Xiao Guangrong Acked-by: Tom Zanussi Reviewed-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A55A8BE.9010707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 52eb0d8dcd75..94305c7bc11c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,6 +757,7 @@ struct event_subsystem { const char *name; struct dentry *entry; void *filter; + int nr_events; }; struct filter_pred; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fecac1314cbe..90cf9360e140 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -851,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + if (strcmp(system->name, name) == 0) { + system->nr_events++; return system->entry; + } } /* need to create new entry */ @@ -871,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } + system->nr_events = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -905,6 +908,32 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1079,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); + remove_subsystem_dir(call->system); } } -- cgit v1.2.3 From db59504d89db1462a5281fb55b1d962cb74a398f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:52:36 +0800 Subject: ksym_tracer: Extract trace entry from struct trace_ksym struct trace_ksym is used as an entry in the hbp list and is also used as the trace_entry stored in the ring buffer. This is unnecessary and wastes ring-buffer memory. There is also a bug: dereferencing field->ksym_hbp in ksym_trace_output() can be invalid.
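[Editor's sketch, not part of the commit: the two structures after the split, collected from the diff below for readability (the CONFIG_PROFILE_KSYM_TRACER counter field is elided). Only small, self-contained fields are copied into the ring buffer; the breakpoint bookkeeping stays on the tracer's private hlist, so the output path never touches a possibly stale ksym_hbp pointer:]

	/* what gets written into the ring buffer -- no kernel pointers */
	struct ksym_trace_entry {
		struct trace_entry ent;
		unsigned long	ip;
		unsigned char	type;	/* access type, copied from the hbp */
		char		ksym_name[KSYM_NAME_LEN];
		char		cmd[TASK_COMM_LEN];
	};

	/* private to trace_ksym.c, lives only on the filter hlist */
	struct trace_ksym {
		struct hw_breakpoint	*ksym_hbp;
		unsigned long		ksym_addr;
		struct hlist_node	ksym_hlist;
	};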
Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E2A4.4050007@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 13 ++++--------- kernel/trace/trace_ksym.c | 26 ++++++++++++++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d5cc37b8fca..ff1ef411a176 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -215,17 +215,12 @@ struct syscall_trace_exit { #define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy" extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); -struct trace_ksym { +struct ksym_trace_entry { struct trace_entry ent; - struct hw_breakpoint *ksym_hbp; - unsigned long ksym_addr; unsigned long ip; -#ifdef CONFIG_PROFILE_KSYM_TRACER - unsigned long counter; -#endif - struct hlist_node ksym_hlist; + unsigned char type; char ksym_name[KSYM_NAME_LEN]; - char p_name[TASK_COMM_LEN]; + char cmd[TASK_COMM_LEN]; }; /* @@ -343,7 +338,7 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ - IF_ASSIGN(var, ent, struct trace_ksym, TRACE_KSYM); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index eef97e7c8db7..085ff055fdfa 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -37,6 +37,15 @@ #define KSYM_TRACER_OP_LEN 3 /* rw- */ #define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1) +struct trace_ksym { + struct hw_breakpoint *ksym_hbp; + unsigned long ksym_addr; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + static struct trace_array *ksym_trace_array; static unsigned int ksym_filter_entry_count; @@ -71,7 +80,7 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) { struct ring_buffer_event *event; struct trace_array *tr; - struct trace_ksym *entry; + struct ksym_trace_entry *entry; int pc; if (!ksym_tracing_enabled) @@ -85,11 +94,12 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) if (!event) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hbp->info.type; strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN); - entry->ksym_hbp = hbp; - entry->ip = instruction_pointer(regs); - strlcpy(entry->p_name, current->comm, TASK_COMM_LEN); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + #ifdef CONFIG_PROFILE_KSYM_TRACER ksym_collect_stats(hbp->info.address); #endif /* CONFIG_PROFILE_KSYM_TRACER */ @@ -380,7 +390,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct trace_ksym *field; + struct ksym_trace_entry *field; char str[KSYM_SYMBOL_LEN]; int ret; @@ -389,12 +399,12 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->p_name, + ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->cmd, entry->pid, iter->cpu, field->ksym_name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - switch (field->ksym_hbp->info.type) { + switch (field->type) { case HW_BREAKPOINT_WRITE: ret = trace_seq_printf(s, " W "); break; -- cgit v1.2.3 From 
be9742e6cb107fe1d77db7a081ea4eb25e79e1ad Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:52:52 +0800 Subject: ksym_tracer: Rewrite ksym_trace_filter_read() Reading ksym_trace_filter gave me some arbitrary characters, when it should show nothing. It's because buf is not initialized when there's no filter. Also reduce stack usage by about 512 bytes. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E2B4.6030706@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 085ff055fdfa..b6710d31bdf0 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -35,7 +35,6 @@ #define KSYM_TRACER_MAX HBP_NUM #define KSYM_TRACER_OP_LEN 3 /* rw- */ -#define KSYM_FILTER_ENTRY_LEN (KSYM_NAME_LEN + KSYM_TRACER_OP_LEN + 1) struct trace_ksym { struct hw_breakpoint *ksym_hbp; @@ -230,25 +229,33 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, { struct trace_ksym *entry; struct hlist_node *node; - char buf[KSYM_FILTER_ENTRY_LEN * KSYM_TRACER_MAX]; - ssize_t ret, cnt = 0; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); mutex_lock(&ksym_tracer_mutex); hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, "%s:", - entry->ksym_hbp->info.name); + ret = trace_seq_printf(s, "%s:", entry->ksym_hbp->info.name); if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE) - cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, - "-w-\n"); + ret = trace_seq_puts(s, "-w-\n"); else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW) - cnt += snprintf(&buf[cnt], KSYM_FILTER_ENTRY_LEN - cnt, - "rw-\n"); + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); } - ret = simple_read_from_buffer(ubuf, count, ppos, buf, strlen(buf)); + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + mutex_unlock(&ksym_tracer_mutex); - return ret; + kfree(s); + + return cnt; } static ssize_t ksym_trace_filter_write(struct file *file, -- cgit v1.2.3 From f088e5471297cc78d7465e1fd997cb1a91a48019 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:53:18 +0800 Subject: ksym_tracer: Fix validation of access type # echo 'pid_max:rw-' > ksym_trace_filter # cat ksym_trace_filter pid_max:rw- # echo 'pid_max:ww-' > ksym_trace_filter (should return -EINVAL) # cat ksym_trace_filter (but it ended up removing filter entry) Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E2CE.6080409@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index b6710d31bdf0..955600929907 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -114,24 +114,22 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) * --x : Set Execution Break points (Not available yet) * */ -static int ksym_trace_get_access_type(char *access_str) +static int ksym_trace_get_access_type(char *str) { - int pos, access = 0; + int access = 0; - for (pos = 0; pos < KSYM_TRACER_OP_LEN; pos++) { - switch (access_str[pos]) { - 
case 'r': - access += (pos == 0) ? 4 : -1; - break; - case 'w': - access += (pos == 1) ? 2 : -1; - break; - case '-': - break; - default: - return -EINVAL; - } - } + if (str[0] == 'r') + access += 4; + else if (str[0] != '-') + return -EINVAL; + + if (str[1] == 'w') + access += 2; + else if (str[1] != '-') + return -EINVAL; + + if (str[2] != '-') + return -EINVAL; switch (access) { case 6: @@ -140,8 +138,6 @@ static int ksym_trace_get_access_type(char *access_str) case 2: access = HW_BREAKPOINT_WRITE; break; - case 0: - access = 0; } return access; -- cgit v1.2.3 From 92cf9f8f7e89c6bdbb1a724f879b8b18fc0dfe0f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:53:47 +0800 Subject: ksym_tracer: Fix validation of length of access type Don't take newline into account, otherwise: # echo 'pid_max:-w-' > ksym_trace_filter # echo -n 'pid_max:rw-' > ksym_trace_filter bash: echo: write error: Invalid argument Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E2EB.9070503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 955600929907..72fcb46c39c0 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -158,21 +158,21 @@ static int ksym_trace_get_access_type(char *str) static int parse_ksym_trace_str(char *input_string, char **ksymname, unsigned long *addr) { - char *delimiter = ":"; int ret; - ret = -EINVAL; - *ksymname = strsep(&input_string, delimiter); + strstrip(input_string); + + *ksymname = strsep(&input_string, ":"); *addr = kallsyms_lookup_name(*ksymname); /* Check for malformed request: (2), (1) and (5) */ if ((!input_string) || - (strlen(input_string) != (KSYM_TRACER_OP_LEN + 1)) || - (*addr == 0)) - goto return_code; + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + ret = ksym_trace_get_access_type(input_string); -return_code: return ret; } -- cgit v1.2.3 From 011ed56853e07e30653d6f1bfddc56b396218664 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:54:08 +0800 Subject: ksym_tracer: NIL-terminate user input filter Make sure the user input string is NULL-terminated. 
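[Editor's sketch, not part of the commit: the standard termination pattern the fix below applies. copy_from_user() copies exactly count bytes and guarantees nothing about termination, so the buffer is allocated one byte larger and terminated explicitly:]

	input_string = kzalloc(count + 1, GFP_KERNEL);	/* +1 for '\0' */
	if (!input_string)
		return -ENOMEM;
	if (copy_from_user(input_string, ubuf, count)) {
		kfree(input_string);
		return -EFAULT;
	}
	input_string[count] = '\0';	/* now safe for strsep()/strlen() */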
Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E300.7020601@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 72fcb46c39c0..8cbed5a6286f 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -264,11 +264,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, unsigned long ksym_addr = 0; int ret, op, changed = 0; - /* Ignore echo "" > ksym_trace_filter */ - if (count == 0) - return 0; - - input_string = kzalloc(count, GFP_KERNEL); + input_string = kzalloc(count + 1, GFP_KERNEL); if (!input_string) return -ENOMEM; @@ -276,6 +272,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, kfree(input_string); return -EFAULT; } + input_string[count] = '\0'; ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); if (ret < 0) { -- cgit v1.2.3 From 0d109c8f70eab8b9f693bd5caea23012394e4876 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:54:28 +0800 Subject: ksym_tracer: Report error when failed to re-register hbp When access type is changed, the hw break point will be unregistered and then be registered again with new access type. But the registration may fail, in this case, -errno should be returned. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E314.7070004@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 8cbed5a6286f..891e3b86b3f6 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -302,13 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file, ret = count; goto unlock_ret_path; } - } + } else + ret = count; ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry->ksym_hbp); kfree(entry); - ret = count; goto err_ret; } else { /* Check for malformed request: (4) */ -- cgit v1.2.3 From 558df6c8f74ac4a0b9026ef85b0028280f364d96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:54:48 +0800 Subject: ksym_tracer: Fix memory leak - When remove a filter, we leak entry->ksym_hbp->info.name. - With CONFIG_FTRAC_SELFTEST enabled, we leak ->info.name: # echo ksym_tracer > current_tracer # echo 'ksym_selftest_dummy:rw-' > ksym_trace_filter # echo nop > current_tracer Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E328.8010200@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 61 +++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 891e3b86b3f6..7d349d34a0d1 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -179,7 +179,7 @@ static int parse_ksym_trace_str(char *input_string, char **ksymname, int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) { struct trace_ksym *entry; - int ret; + int ret = -ENOMEM; if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. 
No" @@ -193,12 +193,13 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) return -ENOMEM; entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); - if (!entry->ksym_hbp) { - kfree(entry); - return -ENOMEM; - } + if (!entry->ksym_hbp) + goto err; + + entry->ksym_hbp->info.name = kstrdup(ksymname, GFP_KERNEL); + if (!entry->ksym_hbp->info.name) + goto err; - entry->ksym_hbp->info.name = ksymname; entry->ksym_hbp->info.type = op; entry->ksym_addr = entry->ksym_hbp->info.address = addr; #ifdef CONFIG_X86 @@ -210,14 +211,18 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (ret < 0) { printk(KERN_INFO "ksym_tracer request failed. Try again" " later!!\n"); - kfree(entry->ksym_hbp); - kfree(entry); - return -EAGAIN; + ret = -EAGAIN; + goto err; } hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); ksym_filter_entry_count++; - return 0; +err: + if (entry->ksym_hbp) + kfree(entry->ksym_hbp->info.name); + kfree(entry->ksym_hbp); + kfree(entry); + return ret; } static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, @@ -289,7 +294,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (entry->ksym_hbp->info.type != op) changed = 1; else - goto err_ret; + goto out; break; } } @@ -298,34 +303,29 @@ static ssize_t ksym_trace_filter_write(struct file *file, entry->ksym_hbp->info.type = op; if (op > 0) { ret = register_kernel_hw_breakpoint(entry->ksym_hbp); - if (ret == 0) { - ret = count; - goto unlock_ret_path; - } - } else - ret = count; + if (ret == 0) + goto out; + } ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); + kfree(entry->ksym_hbp->info.name); kfree(entry->ksym_hbp); kfree(entry); - goto err_ret; + goto out; } else { /* Check for malformed request: (4) */ if (op == 0) - goto err_ret; + goto out; ret = process_new_ksym_entry(ksymname, op, ksym_addr); - if (ret) - goto err_ret; } - ret = count; - goto unlock_ret_path; +out: + mutex_unlock(&ksym_tracer_mutex); -err_ret: kfree(input_string); -unlock_ret_path: - mutex_unlock(&ksym_tracer_mutex); + if (!ret) + ret = count; return ret; } @@ -349,14 +349,7 @@ static void ksym_trace_reset(struct trace_array *tr) ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); - /* Free the 'input_string' only if reset - * after startup self-test - */ -#ifdef CONFIG_FTRACE_SELFTEST - if (strncmp(entry->ksym_hbp->info.name, KSYM_SELFTEST_ENTRY, - strlen(KSYM_SELFTEST_ENTRY)) != 0) -#endif /* CONFIG_FTRACE_SELFTEST*/ - kfree(entry->ksym_hbp->info.name); + kfree(entry->ksym_hbp->info.name); kfree(entry->ksym_hbp); kfree(entry); } -- cgit v1.2.3 From 9d7e934408b52cd53dd85270eb36941a6a318cc5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 7 Jul 2009 13:55:18 +0800 Subject: ksym_tracer: Fix the output of stat tracing - make ksym_tracer_stat_start() return head->first instead of &head->first - make the output properly aligned Before: Access type Symbol Counter NA 0 RW pid_max 0 After: Access Type Symbol Counter ----------- ------ ------- RW pid_max 0 Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: "K.Prasad" Cc: Alan Stern Cc: Steven Rostedt LKML-Reference: <4A52E346.5050608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_ksym.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 7d349d34a0d1..1256a6e8ee24 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ 
-453,8 +453,10 @@ device_initcall(init_ksym_trace); #ifdef CONFIG_PROFILE_KSYM_TRACER static int ksym_tracer_stat_headers(struct seq_file *m) { - seq_printf(m, " Access type "); - seq_printf(m, " Symbol Counter \n"); + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); return 0; } @@ -472,27 +474,27 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v) switch (access_type) { case HW_BREAKPOINT_WRITE: - seq_printf(m, " W "); + seq_puts(m, " W "); break; case HW_BREAKPOINT_RW: - seq_printf(m, " RW "); + seq_puts(m, " RW "); break; default: - seq_printf(m, " NA "); + seq_puts(m, " NA "); } if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0) - seq_printf(m, " %s ", fn_name); + seq_printf(m, " %-36s", fn_name); else - seq_printf(m, " "); + seq_printf(m, " %-36s", ""); + seq_printf(m, " %15lu\n", entry->counter); - seq_printf(m, "%15lu\n", entry->counter); return 0; } static void *ksym_tracer_stat_start(struct tracer_stat *trace) { - return &(ksym_filter_head.first); + return ksym_filter_head.first; } static void * -- cgit v1.2.3 From 68baafcfc46074c4bb4e4c3115c2c76a8a85f37d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:29 +0200 Subject: tracing/function-graph-tracer: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1247107590-6428-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb53..abf7c4ae2c8b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -565,11 +565,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "();\n"); + ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -612,11 +608,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From 6a167c655858cbec4175532fd00417661c87f149 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 9 Jul 2009 04:46:30 +0200 Subject: tracing/kmemtrace: Use the %pf format Remove the obsolete seq_print_ip_sym() usage and replace it by the %pf format in order to print function symbols. 
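For readers unfamiliar with the idiom: %pf is resolved by the kernel's vsnprintf(), which prints the symbol name that a function-pointer argument refers to. That is why a single trace_seq_printf() call can replace the old seq_print_ip_sym() + trace_seq_printf() pair in both of these patches. A minimal sketch of the idiom (illustrative only, not part of either patch):

	/* Prints e.g. "call site: kmem_cache_alloc" instead of a raw hex address */
	static void print_call_site(unsigned long call_site)
	{
		printk(KERN_INFO "call site: %pf\n", (void *)call_site);
	}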
Signed-off-by: Frederic Weisbecker Reviewed-by: Li Zefan Cc: Steven Rostedt Cc: Lai Jiangshan Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1247107590-6428-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 74903b62bcb6..2f6fa47d410c 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -389,19 +389,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Node */ - ret = trace_seq_printf(s, "%4d ", entry->node); + /* Node and call site*/ + ret = trace_seq_printf(s, "%4d %pf\n", entry->node, + (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } @@ -447,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Skip node */ - ret = trace_seq_printf(s, " "); + /* Skip node and print call site*/ + ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; } -- cgit v1.2.3 From 80098c200e2ee3b4c86a9d1e156dbcd05380e08f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Jul 2009 16:15:04 +0800 Subject: kmemtrace: Rename some functions So we have: - kmemtrace_print_alloc/free() for kmemtrace default output - kmemtrace_print_alloc/free_user() for binary output used by kmemtrace-user. 
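The net effect of the rename is easiest to read straight from the dispatch table; after this patch the wiring is (excerpted from the diff below):

	static struct trace_event kmem_trace_alloc = {
		.type	= TRACE_KMEM_ALLOC,
		.trace	= kmemtrace_print_alloc,	/* default text output */
		.binary	= kmemtrace_print_alloc_user,	/* binary output for kmemtrace-user */
	};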
Suggested-by: Eduard - Gabriel Munteanu Signed-off-by: Li Zefan Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B288.70505@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 2f6fa47d410c..dda53ccf749b 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -239,7 +239,7 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -259,7 +259,7 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) +kmemtrace_print_free(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -277,7 +277,7 @@ kmemtrace_print_free_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -311,7 +311,7 @@ kmemtrace_print_alloc_user_bin(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user_bin(struct trace_iterator *iter, int flags) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -467,14 +467,14 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) static struct trace_event kmem_trace_alloc = { .type = TRACE_KMEM_ALLOC, - .trace = kmemtrace_print_alloc_user, - .binary = kmemtrace_print_alloc_user_bin, + .trace = kmemtrace_print_alloc, + .binary = kmemtrace_print_alloc_user, }; static struct trace_event kmem_trace_free = { .type = TRACE_KMEM_FREE, - .trace = kmemtrace_print_free_user, - .binary = kmemtrace_print_free_user_bin, + .trace = kmemtrace_print_free, + .binary = kmemtrace_print_free_user, }; static struct tracer kmem_tracer __read_mostly = { -- cgit v1.2.3 From d8ea37d5de58d35a39d0b4e7d209751aaa1b8174 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:18 +0800 Subject: tracing/stat: Add stat_release() callback Add a stat_release() callback to struct tracer_stat, so a stat tracer can release its entries after the stat file has been read out. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A51B16A.6020708@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 7 +++++-- kernel/trace/trace_stat.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index e66f5e493342..f069461f10bd 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -49,7 +49,8 @@ static struct dentry *stat_dir; * but it will at least advance closer to the next one * to be released.
*/ -static struct rb_node *release_next(struct rb_node *node) +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { struct stat_node *snode; struct rb_node *parent = rb_parent(node); @@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node) parent->rb_right = NULL; snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); return parent; @@ -78,7 +81,7 @@ static void reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; while (node) - node = release_next(node); + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index f3546a2cd826..8f03914b9a6a 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -18,6 +18,8 @@ struct tracer_stat { int (*stat_cmp)(void *p1, void *p2); /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); /* Print the headers of your stat entries */ int (*stat_headers)(struct seq_file *s); }; -- cgit v1.2.3 From a35780005eb256eb5ec83ffcc802967295887a45 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 6 Jul 2009 16:10:23 +0800 Subject: tracing/workqueues: Add refcnt to struct cpu_workqueue_stats The stat entries can be freed while the stat file is being read. Worse, the pointer can be freed immediately after it is returned from workqueue_stat_start/next(). Add a refcnt to struct cpu_workqueue_stats to avoid use-after-free. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A51B16F.6010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 97fcea4acce1..40cafb07dffd 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace_stat.h" #include "trace.h" @@ -16,6 +17,7 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; + struct kref kref; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -39,6 +41,11 @@ struct workqueue_global_stats { static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) +static void cpu_workqueue_stat_free(struct kref *kref) +{ + kfree(container_of(kref, struct cpu_workqueue_stats, kref)); +} + /* Insertion of a work */ static void probe_workqueue_insertion(struct task_struct *wq_thread, @@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) return; } INIT_LIST_HEAD(&cws->list); + kref_init(&cws->kref); cws->cpu = cpu; - cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); @@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) list) { if (node->pid == wq_thread->pid) { list_del(&node->list); - kfree(node); + kref_put(&node->kref, cpu_workqueue_stat_free); goto found; } } @@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if
(!list_empty(&workqueue_cpu_stat(cpu)->list)) { ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); + kref_get(&ret->kref); + } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); @@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace) static void *workqueue_stat_next(void *prev, int idx) { struct cpu_workqueue_stats *prev_cws = prev; + struct cpu_workqueue_stats *ret; int cpu = prev_cws->cpu; unsigned long flags; - void *ret = NULL; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { @@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx) return NULL; } while (!(ret = workqueue_stat_start_cpu(cpu))); return ret; + } else { + ret = list_entry(prev_cws->list.next, + struct cpu_workqueue_stats, list); + kref_get(&ret->kref); } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, - list); + return ret; } static int workqueue_stat_show(struct seq_file *s, void *p) @@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) return 0; } +static void workqueue_stat_release(void *stat) +{ + struct cpu_workqueue_stats *node = stat; + + kref_put(&node->kref, cpu_workqueue_stat_free); +} + static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); @@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = { .stat_start = workqueue_stat_start, .stat_next = workqueue_stat_next, .stat_show = workqueue_stat_show, + .stat_release = workqueue_stat_release, .stat_headers = workqueue_stat_headers }; -- cgit v1.2.3 From 8259cf4342029aad37660e524178c8858f48b0ab Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Thu, 9 Jul 2009 13:08:37 -0400 Subject: printk: Ensure that "console enabled" messages are printed on the console Today, when a console is registered without CON_PRINTBUFFER, end users never see the announcement of it being added, never know whether they missed something or whether the console was really there from the start, and are left generally confused. This re-orders the existing code to make sure the console is added before the "console [%s%d] enabled" message is printed out - ensuring that this message is _always_ seen. This has the desired/intended side effect of making sure that "console enabled" messages are printed on both the bootconsole and the real console. This does cause the same line to be printed twice if the bootconsole and the real console are the same device; if they are on different devices, the message is printed to both consoles.
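The user-visible result is a handover sequence along these lines (console names are illustrative):

	bootconsole [earlyser0] enabled
	... early boot messages ...
	console [ttyS0] enabled, bootconsole disabled

Because the bootconsole is unregistered only after the real console has been enabled, that last line reaches both devices.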
Signed-off-by: Robin Getz Cc: "Andrew Morton" Cc: "Linus Torvalds" LKML-Reference: <200907091308.37370.rgetz@blackfin.uclinux.org> Signed-off-by: Ingo Molnar --- kernel/printk.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index 41fe60995530..668df351e6a9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1240,22 +1240,14 @@ void register_console(struct console *newcon) if (!(newcon->flags & CON_ENABLED)) return; - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) - */ - printk(KERN_INFO "console handover:"); - for_each_console(bcon) - printk("boot [%s%d] ", bcon->name, bcon->index); - printk(" -> real [%s%d]\n", newcon->name, newcon->index); - for_each_console(bcon) - unregister_console(bcon); + /* + * If we have a bootconsole, and are switching to a real console, + * don't print everything out again, since when the boot console, and + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) newcon->flags &= ~CON_PRINTBUFFER; - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); - } /* * Put this console in the list - keep the @@ -1281,6 +1273,28 @@ void register_console(struct console *newcon) spin_unlock_irqrestore(&logbuf_lock, flags); } release_console_sem(); + + /* + * By unregistering the bootconsoles after we enable the real console + * we get the "console xxx enabled" message on all the consoles - + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); + for_each_console(bcon) + if (bcon->flags & CON_BOOT) + unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ?
"boot" : "" , + newcon->name, newcon->index); + } } EXPORT_SYMBOL(register_console); -- cgit v1.2.3 From 9ff80942992cd5abd0779c815f310f65b7b83860 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 8 Jul 2009 22:03:53 +0400 Subject: x86: Clean up idt_descr and idt_tableby using NR_VECTORS instead of hardcoded number Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090708180353.GH5301@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..d6f27c92854b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -982,7 +982,7 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475d..7e4b1f5dec8e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -76,7 +76,7 @@ char ignore_fpu_irq; * F0 0F bug workaround.. We have a special link segment * for this. */ -gate_desc idt_table[256] +gate_desc idt_table[NR_VECTORS] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif -- cgit v1.2.3 From a1b4f1a5b7f57be2593a9f1fca465a529c95fc07 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 5 Jul 2009 20:01:54 +0400 Subject: x86, ipi: Clean up safe_smp_processor_id() by using the cpu_has_apic() macro helper We already use a lot of cpu_has_ helpers. Lets do here the same for consistency. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090705160154.GB4791@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a9..e6b4f517fcfe 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -150,7 +150,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!boot_cpu_has(X86_FEATURE_APIC)) + if (!cpu_has_apic) return 0; apicid = hard_smp_processor_id(); -- cgit v1.2.3 From 1aaad49e856ce41adc07d8ae0c8ef35fc4483245 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Mon, 6 Jul 2009 13:31:48 +0200 Subject: printk: Restore previous console_loglevel when re-enabling logging When logging to console is disabled from userspace using klogctl() and later re-enabled, console_loglevel gets set to the default log level instead to the previous value. This means that if the kernel was booted with 'quiet', the boot is suddenly no longer quiet after logging to console gets re-enabled. Save the current console_loglevel when logging is disabled and restore to that value. If the log level is set to a specific value while disabled, this is interpreted as an implicit re-enabling of the logging. 
The problem that prompted this patch is described in: http://lkml.org/lkml/2009/6/28/234 There are two variations possible on the patch below: 1) If klogctl(7) is called while logging is not disabled, then set level to default (partially preserving current functionality): case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; + if (saved_console_loglevel == -1) + console_loglevel = default_console_loglevel; + else { + console_loglevel = saved_console_loglevel; + saved_console_loglevel = -1; + } 2) If klogctl(8) is called while logging is disabled, then don't enable logging, but remember the requested value for when logging does get enabled again: case 8: /* Set level of messages printed to console */ [...] - console_loglevel = len; + if (saved_console_loglevel == -1) + console_loglevel = len; + else + saved_console_loglevel = len; Yet another option would be to ignore the request. Signed-off-by: Frans Pop Cc: cryptsetup@packages.debian.org Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <200907061331.49930.elendil@planet.nl> Signed-off-by: Ingo Molnar --- kernel/printk.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index 668df351e6a9..e0daaf57985a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -67,6 +67,8 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +static int saved_console_loglevel = -1; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -378,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len) logged_chars = 0; break; case 6: /* Disable logging to console */ + if (saved_console_loglevel == -1) + saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; case 7: /* Enable logging to console */ - console_loglevel = default_console_loglevel; + if (saved_console_loglevel != -1) { + console_loglevel = saved_console_loglevel; + saved_console_loglevel = -1; + } break; case 8: /* Set level of messages printed to console */ error = -EINVAL; @@ -390,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len) if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; + /* Implicitly re-enable logging to console */ + saved_console_loglevel = -1; error = 0; break; case 9: /* Number of chars in the log buffer */ -- cgit v1.2.3 From e90476d3bab4322070c0afb3e3b55671de8664ea Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 11 Jul 2009 09:32:46 +0800 Subject: x86: Remove duplicated #include Remove duplicated #include in: arch/x86/kernel/dumpstack.c Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c8405718a4c3..2d8a371d4339 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,7 +15,6 @@ #include #include #include -#include #include -- cgit v1.2.3 From 8bdbd962ecfcbdd96f9dbb02d780b4553afd2543 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 4 Jul 2009 00:35:45 +0100 Subject: x86/cpu: Clean up various files a bit No code changes except printk levels (although some of the K6 mtrr code might be clearer if there were a few as would splitting out some of the intel cache code). 
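The recurring pattern in this patch is making message severities explicit: a printk() that starts a new line carries a level such as KERN_INFO or KERN_WARNING, and one that continues the previous line says so with KERN_CONT instead of silently inheriting the default level. A representative before/after pair from the K6 code:

	printk("probably OK (after B9730xxxx).\n");		/* level defaults silently */
	printk(KERN_CONT "probably OK (after B9730xxxx).\n");	/* explicit continuation */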
Signed-off-by: Alan Cox LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 37 ++++++----- arch/x86/kernel/cpu/bugs.c | 10 +-- arch/x86/kernel/cpu/bugs_64.c | 2 +- arch/x86/kernel/cpu/common.c | 8 +-- arch/x86/kernel/cpu/cyrix.c | 19 ++++-- arch/x86/kernel/cpu/hypervisor.c | 5 +- arch/x86/kernel/cpu/intel.c | 11 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 116 +++++++++++++++++---------------- arch/x86/kernel/cpu/perfctr-watchdog.c | 45 ++++++------- arch/x86/kernel/cpu/proc.c | 2 +- arch/x86/kernel/cpu/vmware.c | 18 ++--- 11 files changed, 144 insertions(+), 129 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28e5f5956042..c6eb02e69875 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) #define CBAR_ENB (0x80000000) #define CBAR_KEY (0X000000CB) if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); } } @@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); else - printk("probably OK (after B9730xxxx).\n"); + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } @@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -398,7 +400,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u32 level; level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } if (c->x86 == 0x10 || c->x86 == 0x11) @@ -487,27 +489,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. 
*/ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < + ((tseg>>PMD_SHIFT) < (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); } } #endif } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, + unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) size = 64; + /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + (c->x86_mask == 0 || c->x86_mask == 1)) size = 256; } return size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f1aa83..01a265212395 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -81,7 +81,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); + printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); } static void __init check_hlt(void) @@ -98,7 +98,7 @@ static void __init check_hlt(void) halt(); halt(); halt(); - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); } /* @@ -122,9 +122,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk("Buggy.\n"); + printk(KERN_CONT "Buggy.\n"); else - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); #endif } @@ -156,7 +156,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4e..04f0fe5af83e 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d6f27c92854b..c96ea44928bf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -28,13 +28,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e967ef..19807b89f058 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -3,10 +3,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include @@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * The 5510/5520 companion chips have a funky PIT. 
*/ if (vendor == PCI_VENDOR_ID_CYRIX && - (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif @@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * ? : 0x7x * GX1 : 0x8x GX1 datasheet 56 */ - if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); return; } else { /* MediaGX */ @@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); } } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86af0b01..93ba8eeb100a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,11 +28,10 @@ static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) { - if (vmware_platform()) { + if (vmware_platform()) c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - } else { + else c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; - } } unsigned long get_hypervisor_tsc_freq(void) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3260ab044996..80a722a071b5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,17 +7,17 @@ #include #include #include +#include #include #include #include -#include #include #include #include #ifdef CONFIG_X86_64 -#include +#include #include #endif @@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; @@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); } } @@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) /* Intel has a non-standard dependency on %ecx for this CPUID level. */ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return ((eax >> 26) + 1); + return (eax >> 26) + 1; else return 1; } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe217e1a..306bf0dca061 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -3,7 +3,7 @@ * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj : Work with CPU hotplug infrastructure. + * Ashok Raj : Work with CPU hotplug infrastructure. 
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -16,7 +16,7 @@ #include #include -#include +#include #include #define LVL_1_INST 1 @@ -25,14 +25,15 @@ #define LVL_3 4 #define LVL_TRACE 5 -struct _cache_table -{ +struct _cache_table { unsigned char descriptor; char cache_type; short size; }; -/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ @@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = }; -enum _cache_type -{ +enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, @@ -170,31 +170,31 @@ unsigned short num_cache_leaves; Maybe later */ union l1_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 8; - unsigned assoc : 8; - unsigned size_in_kb : 8; + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; }; unsigned val; }; union l2_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned size_in_kb : 16; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; }; unsigned val; }; union l3_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned res : 2; - unsigned size_encoded : 14; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; }; unsigned val; }; @@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) retval = cpuid4_cache_lookup_regs(i, &this_leaf); if (retval >= 0) { - switch(this_leaf.eax.split.level) { - case 1: + switch (this_leaf.eax.split.level) { + case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; @@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; - case 2: + case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; - case 3: + case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); + index_msb = get_count_order( + num_threads_sharing); l3_id = c->apicid >> index_msb; break; - default: + default: break; } } @@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; - for ( i = 0 ; i < n ; i++ ) { + for (i = 0 ; i < n ; i++) { cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if (regs[j] & (1 << 31)) regs[j] = 0; - } + for (j = 0 ; j < 3 ; j++) + if 
(regs[j] & (1 << 31)) + regs[j] = 0; /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { + for (j = 1 ; j < 16 ; j++) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) - { + while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; @@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } if (trace) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if (l1i) + printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); if (l1d) - printk(", L1 D cache: %dK\n", l1d); + printk(KERN_CONT ", L1 D cache: %dK\n", l1d); else - printk("\n"); + printk(KERN_CONT "\n"); if (l2) printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); @@ -558,8 +559,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} #endif static void __cpuinit free_cache_attributes(unsigned int cpu) @@ -645,7 +651,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); static ssize_t show_##file_name \ (struct _cpuid4_info *this_leaf, char *buf) \ { \ - return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } show_one_plus(level, eax.split.level, 0); @@ -656,7 +662,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) { - return sprintf (buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -669,7 +675,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, const struct cpumask *mask; mask = to_cpumask(this_leaf->shared_cpu_map); - n = type? + n = type ? 
cpulist_scnprintf(buf, len-2, mask) : cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; @@ -800,7 +806,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -static struct attribute * default_attrs[] = { +static struct attribute *default_attrs[] = { &type.attr, &level.attr, &coherency_line_size.attr, @@ -815,7 +821,7 @@ static struct attribute * default_attrs[] = { NULL }; -static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -828,8 +834,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) return ret; } -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -883,7 +889,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) goto err_out; per_cpu(index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); if (unlikely(per_cpu(index_kobject, cpu) == NULL)) goto err_out; @@ -917,7 +923,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), @@ -925,9 +931,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) per_cpu(cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { - for (j = 0; j < i; j++) { - kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); - } + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; @@ -952,7 +957,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } @@ -977,8 +982,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = -{ +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6205bf..8100a29c854f 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); + return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); + return msr - 
MSR_ARCH_PERFMON_PERFCTR0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_PERFCTR0); + return msr - MSR_P6_PERFCTR0; case 15: - return (msr - MSR_P4_BPU_PERFCTR0); + return msr - MSR_P4_BPU_PERFCTR0; } } return 0; @@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); + return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + return msr - MSR_ARCH_PERFMON_EVENTSEL0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_EVNTSEL0); + return msr - MSR_P6_EVNTSEL0; case 15: - return (msr - MSR_P4_BSU_ESCR0); + return msr - MSR_P4_BSU_ESCR0; } } return 0; @@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) { BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } /* checks the an msr for availability */ @@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) counter = nmi_perfctr_msr_to_bit(msr); BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); @@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) */ counter_val = (u64)cpu_khz * 1000; do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { + if (counter_val > 0x7fffffffULL) { u64 count = (u64)cpu_khz * 1000; do_div(count, 0x7fffffffUL); retval = count + 1; @@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) apic_write(APIC_LVTPC, APIC_DM_NMI); /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); + write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); } static const struct wd_ops p6_wd_ops = { @@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) if (smp_num_siblings == 2) { unsigned int ebx, apicid; - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; } else #endif ht_num = 0; @@ 
-544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) } evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS + | P4_ESCR_OS | P4_ESCR_USR; cccr_val |= P4_CCCR_THRESHOLD(15) @@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* - * P4 quirks: + * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be @@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) * NOTE: Corresponding bit = 0 in ebx indicates event present. */ cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + if ((eax.split.mask_length < + (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) return 0; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246b..1e904346bbf4 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -128,7 +128,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", + x86_power_flags[i][0] ? " " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399e3234..bc24f514ec93 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,17 +49,17 @@ static inline int __vmware_platform(void) static unsigned long __vmware_get_tsc_khz(void) { - uint64_t tsc_hz; - uint32_t eax, ebx, ecx, edx; + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - return tsc_hz; + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; } /* -- cgit v1.2.3 From 7fe616c5dd50a50f334edec1ea0580b90b7af0d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jul 2009 17:48:28 -0700 Subject: rcu: Simplify RCU CPU-hotplug notification Simplify RCU's CPU-hotplug notifiers so that there is only one notifier. This makes it trivial to provide the ordering guarantee that rcu_barrier() depends on. This change also allows rcu_barrier() to rely on locking wholly contained in _cpu_down(), rather than that of its callers, making the locking more straightforward. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com LKML-Reference: <20090710004828.GA19385@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 8 ++++++-- kernel/rcupreempt.c | 22 +++++----------------- kernel/rcutree.c | 18 +++++++++--------- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index eae29c25fb14..3fea910f402c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -217,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused) wake_up(&rcu_migrate_wq); } +extern int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); + static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, unsigned long action, void *hcpu) { + rcu_cpu_notify(self, action, hcpu); if (action == CPU_DYING) { /* * preempt_disable() in on_each_cpu() prevents stop_machine(), @@ -234,7 +238,7 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); - } else if (action == CPU_POST_DEAD) { + } else if (action == CPU_DEAD) { /* rcu_migrate_head is protected by cpu_add_remove_lock */ wait_migrated_callbacks(); } @@ -244,8 +248,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, void __init rcu_init(void) { - __rcu_init(); hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + __rcu_init(); } void rcu_scheduler_starting(void) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index beb0e659adcc..4300212b2107 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1417,7 +1417,7 @@ int rcu_pending(int cpu) return 0; } -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1439,10 +1439,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - void __init __rcu_init(void) { int cpu; @@ -1471,22 +1467,14 @@ void __init __rcu_init(void) rdp->waitschedtail = &rdp->waitschedlist; rdp->rcu_sched_sleeping = 0; } - register_cpu_notifier(&rcu_nb); /* - * We don't need protection against CPU-Hotplug here - * since - * a) If a CPU comes online while we are iterating over the - * cpu_online_mask below, we would only end up making a - * duplicate call to rcu_online_cpu() which sets the corresponding - * CPU's mask in the rcu_cpu_online_map. - * - * b) A CPU cannot go offline at this point in time since the user - * does not have access to the sysfs interface, nor do we - * suspend the system. + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. 
*/ for_each_online_cpu(cpu) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long) cpu); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c2027..e8e9e93ed6e1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1407,13 +1407,12 @@ static void __cpuinit rcu_online_cpu(int cpu) { rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } /* * Handle CPU online/offline notifcation events. */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1523,10 +1522,6 @@ do { \ } \ } while (0) -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - void __init __rcu_init(void) { int i; /* All used by RCU_DATA_PTR_INIT(). */ @@ -1542,10 +1537,15 @@ void __init __rcu_init(void) rcu_init_one(&rcu_bh_state); RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ for_each_online_cpu(i) - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); + + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } module_param(blimit, int, 0); -- cgit v1.2.3 From 8045a4c293d36c61656a20d581b11f7f0cd7acd5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jul 2009 19:30:25 +0200 Subject: x86/oprofile: Fix cast of counter value When casting the counter value to a 64 bit value in 32 bit mode, sign extension may lead to broken counter values. This patch fixes this by casting to (u64) instead of (s64). 
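The hazard can be shown in isolation (this is the generic sign-extension pitfall the message describes, not the oprofile code itself):

	u32 count = 0x80000000;			/* bit 31 set */
	u64 via_signed = (u64)(s32)count;	/* 0xffffffff80000000: bit 31 replicated */
	u64 via_unsigned = (u64)count;		/* 0x0000000080000000: value preserved */

Widening through a signed 32-bit intermediate fills the upper half with copies of bit 31, so the MSR can end up programmed with a value far from the intended -count.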
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 4 ++-- arch/x86/oprofile/op_model_p4.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index e95268eb9220..7ca8306aefae 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -111,7 +111,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsrl(msrs->counters[i].addr, - -(s64)counter_config[i].count); + -(u64)counter_config[i].count); rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; val |= op_x86_get_ctrl(model, &counter_config[i]); @@ -237,7 +237,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, if (val & OP_CTR_OVERFLOW) continue; oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -(s64)reset_value[i]); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[i]); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index f01e53b118fa..9db9e361182c 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -580,7 +580,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(s64)counter_config[i].count); + -(u64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -625,11 +625,11 @@ static int p4_check_ctrs(struct pt_regs * const regs, if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); wrmsrl(p4_counters[real].counter_address, - -(s64)reset_value[i]); + -(u64)reset_value[i]); CCCR_CLEAR_OVF(low); wrmsr(p4_counters[real].cccr_address, low, high); wrmsrl(p4_counters[real].counter_address, - -(s64)reset_value[i]); + -(u64)reset_value[i]); } } -- cgit v1.2.3 From 44ab9a6b0e909145d42615493952fe986b1ce5c2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 18:33:02 +0200 Subject: x86/oprofile: Rework and simplify nmi_cpu_setup() This patch removes the function nmi_save_registers(). Per-cpu code is now executed only in the function nmi_cpu_setup(). Also, it renames the per-cpu function nmi_restore_registers() to nmi_cpu_restore_registers(). 
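The consolidation is visible in nmi_setup(), where two cross-CPU rounds collapse into one (call flow simplified from the diff below):

	/* before: two IPI rounds */
	on_each_cpu(nmi_save_registers, NULL, 1);
	on_each_cpu(nmi_cpu_setup, NULL, 1);

	/* after: one round; nmi_cpu_setup() now saves the MSRs itself first */
	on_each_cpu(nmi_cpu_setup, NULL, 1);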
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 93df76dd60f4..25da1e17815d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -87,13 +87,6 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } -static void nmi_save_registers(void *dummy) -{ - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - nmi_cpu_save_registers(msrs); -} - static void free_msrs(void) { int i; @@ -137,6 +130,7 @@ static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); spin_unlock(&oprofilefs_lock); @@ -182,13 +176,12 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; @@ -220,7 +213,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); } static void nmi_shutdown(void) -- cgit v1.2.3 From 6e63ea4b0b14ff5fb8a3ca704fcda7d28b95f079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jul 2009 19:25:39 +0200 Subject: x86/oprofile: Whitespace changes only This patch fixes the whitespace of code that will be touched in follow-on patches.
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 ++++++------ arch/x86/oprofile/op_model_amd.c | 12 ++++++------ arch/x86/oprofile/op_model_p4.c | 8 ++++---- arch/x86/oprofile/op_model_ppro.c | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 25da1e17815d..fca8dc94531e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -516,12 +516,12 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 7ca8306aefae..f676f8825a3f 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -91,7 +91,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, int i; /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); @@ -229,7 +229,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); @@ -250,7 +250,7 @@ static void op_amd_start(struct op_msrs const * const msrs) { u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -270,7 +270,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -285,11 +285,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9db9e361182c..5921b7fc724b 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -558,7 +558,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, } /* clear the cccrs we will use */ - for (i = 0 ; i < num_counters ; i++) { + for (i = 0; i < num_counters; i++) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); @@ -575,7 +575,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, } /* setup all counters */ - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); 
@@ -678,7 +678,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } @@ -687,7 +687,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ - for (i = num_counters ; i < num_controls ; ++i) { + for (i = num_counters; i < num_controls; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index cd72d5c73b49..570d717c3308 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -81,7 +81,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); @@ -125,7 +125,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (unlikely(!reset_value)) goto out; - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); @@ -188,11 +188,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } -- cgit v1.2.3 From 6b2b171a774af256082635b53ac387b1613b7b4c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:53:50 -0700 Subject: x86/acpi: acpi_parse_madt_ioapic_entries: remove redundant braces We don't put braces around a single statement. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/acpi/boot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285d..ce31c1af854f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1179,9 +1179,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; -- cgit v1.2.3 From 2f210deba9887dd9143b63b217506f1ac152e91c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:55:22 -0700 Subject: x86/ioapic.c: ioapic_modify_irq is too large to inline If ioapic_modify_irq() is marked inline, it gets inlined several times. Un-inlining it saves around 200 bytes in .text for me. 
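A stand-alone illustration of why (user-space C, invented names): a helper with several call sites gets its body cloned into each caller when it is inlined, while a plain static function is emitted once and shared.

        #include <stdio.h>

        static void sync_readback(int *reg) { printf("sync: %#x\n", *reg); }

        /*
         * Adding "inline" here invites the compiler to clone this body into
         * all three call sites in main(); leaving it out shares one copy.
         */
        static void modify(int *reg, int mask_and, int mask_or,
                           void (*final)(int *))
        {
                *reg = (*reg & mask_and) | mask_or;
                if (final)
                        final(reg);
        }

        int main(void)
        {
                int reg = 0xff;

                modify(&reg, ~0x10, 0x00, NULL);
                modify(&reg, ~0x00, 0x80, sync_readback);
                modify(&reg, ~0x01, 0x02, NULL);
                printf("reg = %#x\n", reg);
                return 0;
        }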
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 90b5e6efa938..82271eb87bb0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -556,9 +556,9 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; -- cgit v1.2.3 From 890aeacf64c55a7ada7054a140d249ab13899f2d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:57:43 -0700 Subject: x86/ioapic.c: unify __mask_IO_APIC_irq() The main difference between the 32 and 64-bit versions of __mask_IO_APIC_irq() is that the 64-bit one does a readback from the I/O APIC to synchronize it. If there's a hardware requirement to do a readback sync after updating an APIC register, then it will be a hardware requirement regardless of whether the kernel is compiled 32 or 64-bit. Unify __mask_IO_APIC_irq() using the 64-bit version which always syncs with io_apic_sync(). Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82271eb87bb0..f8aa5461071b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -580,7 +580,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -596,12 +595,8 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} +#ifdef CONFIG_X86_32 static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, -- cgit v1.2.3 From 916a0fe739f151664f7f07b42543ae6fd4caec49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:00:22 -0700 Subject: x86/ioapic.c: remove #ifdef for 82093AA workaround While no 64-bit hardware will have a version 0x11 I/O APIC which needs the level/edge bug workaround, that's not a particular reason to use CONFIG_X86_32 to #ifdef the code out. Most 32-bit machines will no longer need the workaround either, so the test to see whether it is necessary should be more fine-grained than "32-bit=yes, 64-bit=no". (Also fix formatting of block comment.) 
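A sketch of what such a finer-grained test could look like (illustrative only; the patch below merely drops the #ifdef and keeps the workaround unconditional):

        #include <stdbool.h>

        /*
         * Hypothetical helper: key the workaround off the I/O APIC version
         * reported by the hardware rather than off CONFIG_X86_32. Other
         * affected versions, if any, would extend this test.
         */
        static bool io_apic_needs_eoi_workaround(unsigned int apic_version)
        {
                return apic_version == 0x11;    /* 82093AA-class parts */
        }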
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 47 +++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f8aa5461071b..1a3414442583 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -596,7 +596,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#ifdef CONFIG_X86_32 static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, @@ -608,7 +607,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -2510,11 +2508,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2527,31 +2522,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. 
--macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2593,7 +2585,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2601,7 +2593,6 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP -- cgit v1.2.3 From 83c21bedf63ce92a2dd82ae2c7a96179b0aa4372 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:13:04 -0700 Subject: x86/ioapic.c: remove redundant declaration of irq_pin_list The structure is defined immediately below, so there's no need to forward declare it. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1a3414442583..ec52e0c045c4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -116,8 +116,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - /* * This is performance-critical, we want to do it O(1) * -- cgit v1.2.3 From 8e13d697febc1ba17e70ed88789255c8bc25aa41 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:14:59 -0700 Subject: x86/ioapic.c: move lost comment to what seems like appropriate place The comment got separated from its subject, so move it to what appears to be the right place, and update to describe the current structure. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ec52e0c045c4..a097a773bc76 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -116,13 +116,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -137,6 +130,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; -- cgit v1.2.3 From d8c52063ed85dda61b70bc05b90711478db5dc17 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:17:58 -0700 Subject: x86/ioapic.c: convert io_apic_level_ack_pending loop to normal for() loop Convert the unconventional loop in io_apic_level_ack_pending() to a conventional for() loop. 
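The conversion pattern itself, stand-alone (made-up node type; the kernel loop additionally holds ioapic_lock and reads hardware):

        #include <stddef.h>

        struct node { int val; struct node *next; };

        /*
         * Before:  for (;;) { if (!e) break; ... ; if (!e->next) break;
         *          e = e->next; }
         * After:   the idiomatic form below, visiting exactly the same nodes.
         */
        static int list_sum(const struct node *head)
        {
                const struct node *e;
                int total = 0;

                for (e = head; e != NULL; e = e->next)
                        total += e->val;
                return total;
        }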
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a097a773bc76..0d0401802d4f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -410,13 +410,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -424,9 +421,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3 From 875e68ec32fc5495f3edf987aaae1c52306184b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:24:11 -0700 Subject: x86/ioapic.c: simplify add_pin_to_irq_node() Rather than duplicating the same alloc/init code twice, restructure the function to look for duplicates and then add an entry if none is found. This function is not performance critical; all but one of its callers are __init functions, and the non-__init caller is for PCI device setup. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0d0401802d4f..d9e8f19088d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -490,34 +490,22 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **entryp, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { + for (entryp = &cfg->irq_2_pin; + *entryp != NULL; + entryp = &(*entryp)->next) { + entry = *entryp; /* not again, please */ if (entry->apic == apic && entry->pin == pin) return; - - entry = entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); entry->apic = apic; entry->pin = pin; + + *entryp = entry; } /* -- cgit v1.2.3 From 535b64291a9d1ff8bc54642494a5fce27e1e1170 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:29:26 -0700 Subject: x86/ioapic.c: convert replace_pin_at_irq_node to conventional for() loop Use a conventional for() loop in replace_pin_at_irq_node(). 
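The **entryp construction in the add_pin_to_irq_node() simplification above is worth a stand-alone note: walking the list through a pointer-to-pointer makes the empty-list and append-at-tail cases identical. A minimal user-space version (types and names invented):

        #include <stdlib.h>

        struct node { int key; struct node *next; };

        static int add_unique(struct node **head, int key)
        {
                struct node **pp, *n;

                /*
                 * pp always points at the link to patch: either *head itself
                 * or some node's ->next field.
                 */
                for (pp = head; *pp != NULL; pp = &(*pp)->next)
                        if ((*pp)->key == key)
                                return 0;       /* already present */

                n = malloc(sizeof(*n));
                if (!n)
                        return -1;
                n->key = key;
                n->next = NULL;
                *pp = n;                /* head and tail handled alike */
                return 0;
        }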
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9e8f19088d4..9386976b675b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -515,10 +515,10 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; + struct irq_pin_list *entry; int replaced = 0; - while (entry) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -526,7 +526,6 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, /* every one is different, right? */ break; } - entry = entry->next; } /* why? call replace before add? */ -- cgit v1.2.3 From 4eea6fff612f54380dd642b045bf03ac0613fe3e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:32:15 -0700 Subject: x86/ioapic.c: clean up replace_pin_at_irq_node logic and comments There's no need for a control variable in replace_pin_at_irq_node(); it can just return if it finds the old apic/pin to replace. If the loop terminates, then it didn't find the old apic/pin, so it can add the new ones. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9386976b675b..8245e62ed93f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -512,25 +512,22 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { struct irq_pin_list *entry; - int replaced = 0; for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } static void io_apic_modify_irq(struct irq_cfg *cfg, -- cgit v1.2.3 From 638f2f8c52a92c15ebda9e50d84c1ab56fc42e42 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:37:52 -0700 Subject: x86/ioapic.c: convert __target_IO_APIC_irq to conventional for() loop Use a normal for() loop in __target_IO_APIC_irq(). 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8245e62ed93f..17883cd82592 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2236,13 +2236,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2255,9 +2251,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } -- cgit v1.2.3 From e25371d60cb06a44d7a32d7966ab9bfbeacb9390 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:49:01 -0700 Subject: x86/ioapic.c: unify ioapic_retrigger_irq() The 32 and 64-bit versions of ioapic_retrigger_irq() are identical except the 64-bit one takes vector_lock. vector_lock is defined and used on 32-bit too, so just use a common ioapic_retrigger_irq(). Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 17883cd82592..cf51b0b58c56 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2178,7 +2178,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2191,14 +2190,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, -- cgit v1.2.3 From da706d8bc833e7153622435560422e653bdb2e94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Jul 2009 16:27:30 +0800 Subject: ring_buffer: Fix warning while ignoring cmpxchg return value kernel/trace/ring_buffer.c: In function 'rb_tail_page_update': kernel/trace/ring_buffer.c:849: warning: value computed is not used kernel/trace/ring_buffer.c:850: warning: value computed is not used Add "(void)"s to fix this warning, because we don't need here to handle the fail case of cmpxchg, it's fine if an interrupt already did the job. Changed from V1: Add a comment(which is written by Steven) for it. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e648ba4f70e0..51633d74a21e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -845,9 +845,14 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * This will only succeed if an interrupt did * not come in and change it. In which case, we * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. 
*/ - local_cmpxchg(&next_page->write, old_write, val); - local_cmpxchg(&next_page->entries, old_entries, eval); + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); /* * No need to worry about races with clearing out the commit. -- cgit v1.2.3 From 64fbcd162819bddaf0d99e78b16371b655aa5dee Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 15 Jul 2009 12:32:15 +0800 Subject: tracing/function: Simplify __ftrace_replace_code() Rewrite the __ftrace_replace_code() function, simplify it, but don't change the code's logic. First, we get the state we want to set, if the record has the same state, then do nothing, otherwise enable/disable it. Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 72 +++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 54 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bce9e01a29c8..217caeca71cd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1017,71 +1017,35 @@ static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; - unsigned long ip, fl; + unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; - ip = rec->ip; - /* - * If this record is not to be traced and - * it is not enabled then do nothing. + * If this record is not to be traced or we want to disable it, + * then disable it. * - * If this record is not to be traced and - * it is enabled then disable it. + * If we want to enable it and filtering is off, then enable it. * + * If we want to enable it and filtering is on, enable it only if + * it's filtered */ - if (rec->flags & FTRACE_FL_NOTRACE) { - if (rec->flags & FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - return 0; - - } else if (ftrace_filtered && enable) { - /* - * Filtering is on: - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - - /* Record is filtered and enabled, do nothing */ - if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) - return 0; - - /* Record is not filtered or enabled, do nothing */ - if (!fl) - return 0; - - /* Record is not filtered but enabled, disable it */ - if (fl == FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - /* Otherwise record is filtered but not enabled, enable it */ - rec->flags |= FTRACE_FL_ENABLED; - } else { - /* Disable or not filtered */ - - if (enable) { - /* if record is enabled, do nothing */ - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - - rec->flags |= FTRACE_FL_ENABLED; - - } else { + if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + flag = FTRACE_FL_ENABLED; + } - /* if record is not enabled, do nothing */ - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); } - if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, ftrace_addr); - else - return ftrace_make_nop(NULL, rec, ftrace_addr); + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) -- cgit v1.2.3 From 79173bf556417a737e9d2e096e0788452ec30a61 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:17:11 +0800 
Subject: tracing/trace_stack: Cleanup for trace_lookup_stack() We can directly use %pF input format instead of sprint_symbol() and %s input format. Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stack.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e644af910124..a4dc8d9ad1b1 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -234,15 +234,8 @@ static void t_stop(struct seq_file *m, void *p) static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - sprint_symbol(str, addr); - - return seq_printf(m, "%s\n", str); -#else - return seq_printf(m, "%p\n", (void*)addr); -#endif + return seq_printf(m, "%pF\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit v1.2.3 From 6f2f3cf00ee32f75ba007a46bab88a54d68a5deb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2009 14:21:08 +0800 Subject: tracing/function: Cleanup for function tracer We can directly use %pf input format instead of kallsyms_lookup() and %s input format Signed-off-by: Xiao Guangrong Reviewed-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/ftrace.c | 17 +++-------------- kernel/trace/trace_functions.c | 4 +--- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 217caeca71cd..80a97a51442d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1403,18 +1403,13 @@ static int t_hash_show(struct seq_file *m, void *v) { struct ftrace_func_probe *rec; struct hlist_node *hnd = v; - char str[KSYM_SYMBOL_LEN]; rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); - - kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); - seq_printf(m, "%s", str); + seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1512,7 +1507,6 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; if (iter->flags & FTRACE_ITER_HASH) return t_hash_show(m, v); @@ -1525,9 +1519,7 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", (void *)rec->ip); return 0; } @@ -2508,7 +2500,6 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { unsigned long *ptr = v; - char str[KSYM_SYMBOL_LEN]; if (!ptr) return 0; @@ -2518,9 +2509,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - kallsyms_lookup(*ptr, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%pf\n", v); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 7402144bff21..b53dc994dfb6 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -288,11 +288,9 @@ static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - char str[KSYM_SYMBOL_LEN]; long count = (long)data; - kallsyms_lookup(ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); + seq_printf(m, "%pf:", 
(void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); -- cgit v1.2.3 From 566b0aaf798a0dddfc455d1a5b05c424c6686c65 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Thu, 16 Jul 2009 21:44:26 +0200 Subject: tracing: Remove unused fields/variables Signed-off-by: Jiri Olsa Cc: rostedt@goodmis.org LKML-Reference: <1247773468-11594-2-git-send-email-jolsa@redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6227dc806377..24e3ff53b24b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1339,7 +1339,6 @@ struct ftrace_iterator { unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; - unsigned filtered; }; static void * @@ -2268,7 +2267,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } if (isspace(ch)) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ret = ftrace_process_regex(iter->buffer, iter->buffer_idx, enable); @@ -2399,7 +2397,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) iter = file->private_data; if (iter->buffer_idx) { - iter->filtered++; iter->buffer[iter->buffer_idx] = 0; ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60a49488b791..e30e6b1dbd4e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4265,7 +4265,6 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { - struct trace_array_cpu *data; int ring_buf_size; int i; int ret = -ENOMEM; @@ -4315,7 +4314,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + global_trace.data[i] = &per_cpu(global_trace_cpu, i); max_tr.data[i] = &per_cpu(max_data, i); } -- cgit v1.2.3 From d34a4debef933061924ee17c2ede33f5c44925fb Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Thu, 16 Jul 2009 21:44:28 +0200 Subject: tracing: Remove .globl in the scripts/recordmcount.pl doc I was reading throught the recordmcount.pl starting comment, and spotted a tiny discrepancy. The second example is about my_func not being global, but the example code has the ".globl my_func" statement just moved. Signed-off-by: Jiri Olsa Cc: rostedt@goodmis.org LKML-Reference: <1247773468-11594-4-git-send-email-jolsa@redhat.com> Signed-off-by: Ingo Molnar --- scripts/recordmcount.pl | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 7109e2b5bc0a..db4ebe1b9960 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -57,7 +57,6 @@ # call mcount (offset: 0x5) # [...] # ret -# .globl my_func # other_func: # [...] 
# call mcount (offset: 0x1b) -- cgit v1.2.3 From e7aaaa6934636d7a6cadd9e2a05250fbb6a34f65 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Drop the need_resched() loop from cond_resched() The schedule() function is a loop that reschedules the current task while the TIF_NEED_RESCHED flag is set: void schedule(void) { need_resched: /* schedule code */ if (need_resched()) goto need_resched; } And cond_resched() repeat this loop: do { add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); } while(need_resched()); This loop is needless because schedule() already did the check and nothing can set TIF_NEED_RESCHED between schedule() exit and the loop check in need_resched(). Then remove this needless loop. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 03f7e3fd80b5..4c5ee843d57f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6618,11 +6618,9 @@ static void __cond_resched(void) * PREEMPT_ACTIVE, which could trigger a second * cond_resched() call. */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) -- cgit v1.2.3 From 4b2155678d7cc7b5f45d6b36049091376c3408a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Remove obsolete comment in __cond_resched() Remove the outdated comment from __cond_resched() related to the now removed Big Kernel Semaphore. Reported-by: Arnd Bergmann Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 4c5ee843d57f..4d39e96044b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6613,11 +6613,6 @@ static void __cond_resched(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); #endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.3 From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for __might_sleep() Cover the off case for __might_sleep(), so that we avoid #ifdefs in files that make use of it. Especially, this prepares for the __might_sleep() pull up on cond_resched(). 
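The pattern in isolation (CONFIG_DEBUG_FOO is a made-up option): the config-off branch supplies an empty static inline with the same signature, so call sites never need an #ifdef of their own.

        #ifdef CONFIG_DEBUG_FOO
        extern void debug_check(const char *file, int line);
        #else
        static inline void debug_check(const char *file, int line) { }
        #endif

        /* Callers stay unconditional either way: */
        static inline void some_helper(void)
        {
                debug_check(__FILE__, __LINE__);
        }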
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + kernel/sched.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8def..b804f694ae6a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -139,6 +139,7 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) #else + static inline void __might_sleep(char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 4d39e96044b9..370a6c31c5e1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,9 +6610,8 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); -#endif + add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); -- cgit v1.2.3 From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Add a preempt count base offset to __might_sleep() Add a preempt count base offset to compare against the current preempt level count. It prepares to pull up the might_sleep check from cond_resched() to cond_resched_lock() and cond_resched_bh(). For these two helpers, we need to respectively ensure that once we'll unlock the given spinlock / reenable local softirqs, we will reach a sleepable state. Signed-off-by: Frederic Weisbecker [ Move and rename preempt_count_equals() ] Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 6 +++--- kernel/sched.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b804f694ae6a..2b5b1e0899a8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -125,7 +125,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); + void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -137,9 +137,9 @@ extern int _cond_resched(void); * supposed to. 
*/ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line) { } + static inline void __might_sleep(char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 370a6c31c5e1..3ff4d004bd95 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,7 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__); + __might_sleep(__FILE__, __LINE__, 0); add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -9429,13 +9429,20 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; -- cgit v1.2.3 From 6f80bd985fe242c2e6a8b6209ed20b0495d3d63b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Remove the CONFIG_PREEMPT_BKL case definition of cond_resched() CONFIG_PREEMPT_BKL doesn't exist anymore. So remove this config-on case definition of cond_resched(). Reported-by: Peter Zijlstra Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9bada20e2b23..e2bdf18e05c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2285,17 +2285,12 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int _cond_resched(void); -#ifdef CONFIG_PREEMPT_BKL -static inline int cond_resched(void) -{ - return 0; -} -#else + static inline int cond_resched(void) { return _cond_resched(); } -#endif + extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); static inline int cond_resched_bkl(void) -- cgit v1.2.3 From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Pull up the might_sleep() check into cond_resched() might_sleep() is called late-ish in cond_resched(), after the need_resched()/preempt enabled/system running tests are checked. It's better to check the sleeps while atomic earlier and not depend on some environment datas that reduce the chances to detect a problem. 
Also define cond_resched_*() helpers as macros, so that the FILE/LINE reported in the sleeping while atomic warning displays the real origin and not sched.h Changes in v2: - Call __might_sleep() directly instead of might_sleep() which may call cond_resched() - Turn cond_resched() into a macro so that the file:line couple reported refers to the caller of cond_resched() and not __cond_resched() itself. Changes in v3: - Also propagate this __might_sleep() pull up to cond_resched_lock() and cond_resched_softirq() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/dcache.c | 1 + include/linux/sched.h | 29 +++++++++++++++++++---------- kernel/sched.c | 12 +++++------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6ba..a100fa35a48f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/include/linux/sched.h b/include/linux/sched.h index e2bdf18e05c4..c41d424db887 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2286,17 +2286,26 @@ static inline int need_resched(void) */ extern int _cond_resched(void); -static inline int cond_resched(void) -{ - return _cond_resched(); -} +#define cond_resched() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) -extern int cond_resched_lock(spinlock_t * lock); -extern int cond_resched_softirq(void); -static inline int cond_resched_bkl(void) -{ - return _cond_resched(); -} +extern int __cond_resched_lock(spinlock_t *lock); + +#define cond_resched_lock(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __cond_resched_lock(lock); \ +}) + +extern int __cond_resched_softirq(void); + +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ + __cond_resched_softirq(); \ +}) + +#define cond_resched_bkl() cond_resched() /* * Does a critical section need to be broken due to another diff --git a/kernel/sched.c b/kernel/sched.c index 3ff4d004bd95..1f7919add8ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,6 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__, 0); - add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); @@ -6628,14 +6626,14 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; @@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. 
-- cgit v1.2.3 From def01bc53d03881acfc393bd10a5c7575187e008 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: sched: Convert the only user of cond_resched_bkl to use cond_resched() fs/locks.c:flock_lock_file() is the only user of cond_resched_bkl() This helper doesn't do anything more than cond_resched(). The latter naming is enough to explain that we are rescheduling if needed. The bkl suffix suggests another semantics but it's actually a synonym of cond_resched(). Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-7-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/locks.c | 2 +- include/linux/sched.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index b6440f52178f..2eb81975c99c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { diff --git a/include/linux/sched.h b/include/linux/sched.h index c41d424db887..cbbfca69aa4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2305,8 +2305,6 @@ extern int __cond_resched_softirq(void); __cond_resched_softirq(); \ }) -#define cond_resched_bkl() cond_resched() - /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, -- cgit v1.2.3 From 254e0a6bff87ab8b22293c4bd1443507df698407 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:08:54 +0900 Subject: x86: Use get_desc_base() Use get_desc_base() to get the base address in desc_struct Signed-off-by: Akinobu Mita LKML-Reference: <20090718150853.GA11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 4 +--- arch/x86/kernel/step.c | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d9..37250fe490b1 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c4..3149032ff107 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include #include #include +#include unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? 
*/ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } -- cgit v1.2.3 From fde0312d01b60a3fd5dc56e69a9613defbbc7097 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:09:56 +0900 Subject: x86: Remove unused patch_espfix_desc() patch_espfix_desc() is not used after commit dc4c2a0aed3b09f6e255bd5c3faa50fe6e0b2ded Signed-off-by: Akinobu Mita LKML-Reference: <20090718150955.GB11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 4 +--- arch/x86/kernel/traps.c | 21 --------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bfd74c032fca..4da91ad69e0d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi; void math_error(void __user *); void math_emulate(struct math_emu_info *); -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long, unsigned long); -#else +#ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475d..236794110207 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -786,27 +786,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#endif - asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -- cgit v1.2.3 From 57594742a2b545f8f114cda34f15650be8ae976d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:11:06 +0900 Subject: x86: Introduce set_desc_base() and set_desc_limit() Rename set_base()/set_limit to set_desc_base()/set_desc_limit() and rewrite them in C. These are naturally introduced by the idea of get_desc_base()/get_desc_limit(). The conversion actually found the bug in apm_32.c: bad_bios_desc is written at run-time, but it is defined const variable. 
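The base address in an x86 segment descriptor is scattered across three fields for historical reasons; hiding that split is all these helpers do. A stand-alone round trip of the same packing (simplified struct, user-space):

        #include <assert.h>
        #include <stdint.h>

        /* Simplified stand-in for the base fields of struct desc_struct. */
        struct desc {
                uint16_t base0;         /* base bits  0..15 */
                uint8_t  base1;         /* base bits 16..23 */
                uint8_t  base2;         /* base bits 24..31 */
        };

        static void set_base(struct desc *d, uint32_t base)
        {
                d->base0 = base & 0xffff;
                d->base1 = (base >> 16) & 0xff;
                d->base2 = (base >> 24) & 0xff;
        }

        static uint32_t get_base(const struct desc *d)
        {
                return d->base0 | ((uint32_t)d->base1 << 16) |
                       ((uint32_t)d->base2 << 24);
        }

        int main(void)
        {
                struct desc d;

                set_base(&d, 0x12345678);
                assert(get_base(&d) == 0x12345678);
                return 0;
        }

The const finding matters because a const-qualified object may be placed in read-only memory, so writing it at run time can fault; it previously worked only because the section happened to be writable.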
Signed-off-by: Akinobu Mita LKML-Reference: <20090718151105.GC11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 13 +++++++++++++ arch/x86/include/asm/stackprotector.h | 4 +--- arch/x86/include/asm/system.h | 27 --------------------------- arch/x86/kernel/apm_32.c | 18 +++++++++--------- drivers/pnp/pnpbios/bioscalls.c | 21 +++++++++++---------- 5 files changed, 34 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c993e9e0fed4..e8de2f6f5ca5 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc) return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); } +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + static inline unsigned long get_desc_limit(const struct desc_struct *desc) { return desc->limit0 | (desc->limit << 16); } +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2d742c6e15f..cdc5e0b126a7 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -90,9 +90,7 @@ static inline void setup_stack_canary_segment(int cpu) struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; - desc.base0 = canary & 0xffff; - desc.base1 = (canary >> 16) & 0xff; - desc.base2 = (canary >> 24) & 0xff; + set_desc_base(&desc, canary); write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); #endif } diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 643c59b4bc6e..75c49c782e20 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -150,33 +150,6 @@ do { \ #endif #ifdef __KERNEL__ -#define _set_base(addr, base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while (0) - -#define _set_limit(addr, limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while (0) - -#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) -#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) extern void native_load_gs_index(unsigned); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a4..b5e841bd60d9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; +static struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; static const char driver_version[] = "1.16ac"; /* no spaces */ @@ 
-2337,8 +2337,8 @@ static int __init apm_init(void) * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); + set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2358,12 +2358,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 7e6b5a3b3281..45ad3e9cc362 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -55,9 +55,9 @@ __asm__(".text \n" #define Q2_SET_SEL(cpu, selname, address, size) \ do { \ -struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ -set_base(gdt[(selname) >> 3], (u32)(address)); \ -set_limit(gdt[(selname) >> 3], size); \ + struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ + set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ + set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) static struct desc_struct bad_bios_desc; @@ -479,16 +479,17 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) bad_bios_desc.a = 0; bad_bios_desc.b = 0x00409200; - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); + set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { struct desc_struct *gdt = get_cpu_gdt_table(i); if (!gdt) continue; - set_base(gdt[GDT_ENTRY_PNPBIOS_CS32], &pnp_bios_callfunc); - set_base(gdt[GDT_ENTRY_PNPBIOS_CS16], - __va(header->fields.pm16cseg)); - set_base(gdt[GDT_ENTRY_PNPBIOS_DS], - __va(header->fields.pm16dseg)); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], + (unsigned long)&pnp_bios_callfunc); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS16], + (unsigned long)__va(header->fields.pm16cseg)); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_DS], + (unsigned long)__va(header->fields.pm16dseg)); } } -- cgit v1.2.3 From 4d4036e0e7299c6cbb2d2421b4b30b7a409ce61a Mon Sep 17 00:00:00 2001 From: Jason Yeh Date: Wed, 8 Jul 2009 13:49:38 +0200 Subject: oprofile: Implement performance counter multiplexing The number of hardware counters is limited. The multiplexing feature enables OProfile to gather more events than counters are provided by the hardware. This is realized by switching between events at a user-specified time interval. A new file (/dev/oprofile/time_slice) is added for the user to specify the timer interval in ms. If the number of events to profile is higher than the number of hardware counters available, the patch will schedule a work queue that switches the event counter and rewrites the different sets of values into it.
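
(Illustration, not part of the patch: the rotation of event sets that the work queue performs can be modeled in a few lines of plain C. NUM_COUNTERS and NUM_VIRT_COUNTERS match the AMD values introduced below; the wrap condition is simplified.)

#include <stdio.h>

#define NUM_COUNTERS      4   /* physical counters, AMD value below */
#define NUM_VIRT_COUNTERS 32  /* virtual counters after this patch  */

int main(void)
{
	int si = 0, slice;

	/* Every time slice the window of active events advances by
	 * NUM_COUNTERS and wraps once the virtual range is used up. */
	for (slice = 0; slice < 10; slice++) {
		printf("slice %d: events %d..%d\n",
		       slice, si, si + NUM_COUNTERS - 1);
		si += NUM_COUNTERS;
		if (si >= NUM_VIRT_COUNTERS)
			si = 0;
	}
	return 0;
}
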
The switching mechanism needs to be implemented for each architecture to support multiplexing. This patch only implements AMD CPU support, but multiplexing can be easily extended for other models and architectures. There are follow-on patches that rework parts of this patch. Signed-off-by: Jason Yeh Signed-off-by: Robert Richter --- arch/Kconfig | 12 +++ arch/x86/oprofile/nmi_int.c | 162 ++++++++++++++++++++++++++++++++++++-- arch/x86/oprofile/op_counter.h | 2 +- arch/x86/oprofile/op_model_amd.c | 110 ++++++++++++++++++++++---- arch/x86/oprofile/op_model_p4.c | 4 + arch/x86/oprofile/op_model_ppro.c | 2 + arch/x86/oprofile/op_x86_model.h | 7 ++ drivers/oprofile/oprof.c | 78 ++++++++++++++++++ drivers/oprofile/oprof.h | 2 + drivers/oprofile/oprofile_files.c | 43 ++++++++++ drivers/oprofile/oprofile_stats.c | 10 +++ include/linux/oprofile.h | 3 + 12 files changed, 415 insertions(+), 20 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 99193b160232..beea3ccebb5e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -30,6 +30,18 @@ config OPROFILE_IBS If unsure, say N. +config OPROFILE_EVENT_MULTIPLEX + bool "OProfile multiplexing support (EXPERIMENTAL)" + default n + depends on OPROFILE && X86 + help + The number of hardware counters is limited. The multiplexing + feature enables OProfile to gather more events than counters + are provided by the hardware. This is realized by switching + between events at an user specified time interval. + + If unsure, say N. + config HAVE_OPROFILE bool diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index fca8dc94531e..e54f6a0b35ac 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,11 +1,14 @@ /** * @file nmi_int.c * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Robert Richter + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit */ #include @@ -24,6 +27,12 @@ #include "op_counter.h" #include "op_x86_model.h" + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +DEFINE_PER_CPU(int, switch_index); +#endif + + static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -31,6 +40,13 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +extern atomic_t multiplex_counter; +#endif + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + /* common functions */ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, @@ -95,6 +111,11 @@ static void free_msrs(void) per_cpu(cpu_msrs, i).counters = NULL; kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; +#endif } } @@ -103,6 +124,9 @@ static int allocate_msrs(void) int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; +#endif int i; for_each_possible_cpu(i) { @@ -118,6 +142,14 @@ static int allocate_msrs(void) success = 0; break; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if 
(!per_cpu(cpu_msrs, i).multiplex) { + success = 0; + break; + } +#endif } if (!success) @@ -126,6 +158,25 @@ static int allocate_msrs(void) return success; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; + } + } +} + +#endif + static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); @@ -133,6 +184,9 @@ static void nmi_cpu_setup(void *dummy) nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + nmi_setup_cpu_mux(msrs); +#endif spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -173,14 +227,52 @@ static int nmi_setup(void) memcpy(per_cpu(cpu_msrs, cpu).controls, per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +#endif } - } on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + struct op_msr *multiplex = msrs->multiplex; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + int offset = i + si; + if (multiplex[offset].addr) { + rdmsrl(multiplex[offset].addr, + multiplex[offset].saved); + } + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + struct op_msr *multiplex = msrs->multiplex; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + int offset = i + si; + if (multiplex[offset].addr) { + wrmsrl(multiplex[offset].addr, + multiplex[offset].saved); + } + } +} + +#endif + static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -214,6 +306,9 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + __get_cpu_var(switch_index) = 0; +#endif } static void nmi_shutdown(void) @@ -252,16 +347,15 @@ static void nmi_stop(void) on_each_cpu(nmi_cpu_stop, NULL, 1); } -struct op_counter_config counter_config[OP_MAX_COUNTER]; - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; - for (i = 0; i < model->num_counters; ++i) { + for (i = 0; i < model->num_virt_counters; ++i) { struct dentry *dir; char buf[4]; +#ifndef CONFIG_OPROFILE_EVENT_MULTIPLEX /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. 
* NOTE: assumes 1:1 mapping here (that counters are organized @@ -269,6 +363,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) */ if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) continue; +#endif /* CONFIG_OPROFILE_EVENT_MULTIPLEX */ snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); @@ -283,6 +378,57 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!model->switch_ctrl) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + atomic_inc(&multiplex_counter); + + return 0; +} + +#endif + #ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) @@ -516,12 +662,18 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + __raw_get_cpu_var(switch_index) = 0; +#endif ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + ops->switch_events = nmi_switch_event; +#endif if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 91b6a116165e..e28398df0df2 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,7 +10,7 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. 
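
(Aside, not from the patch: OP_MAX_COUNTER can grow to 32 while the hardware still has only 4 counters because of the modulo aliasing set up in op_amd_fill_in_addresses() below; every virtual counter shares one of the physical MSRs. A standalone sketch, using the corrected NUM_COUNTERS modulus from the follow-up fix further down; the MSR base value is assumed from the K7 MSR layout:)

#include <stdio.h>

#define NUM_COUNTERS      4           /* physical perfctr MSRs      */
#define NUM_VIRT_COUNTERS 32          /* OP_MAX_COUNTER after patch */
#define MSR_K7_PERFCTR0   0xc0010004  /* assumed K7 MSR base        */

int main(void)
{
	int i;

	/* Each virtual counter aliases a physical MSR round-robin. */
	for (i = 0; i < NUM_VIRT_COUNTERS; i++)
		printf("virt %2d -> MSR %#x\n",
		       i, MSR_K7_PERFCTR0 + i % NUM_COUNTERS);
	return 0;
}
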
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index f676f8825a3f..fdbed3a0c877 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -9,12 +9,15 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter - * @author Barry Kasindorf + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit */ #include #include #include +#include #include #include @@ -25,12 +28,23 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_CONTROLS 32 +#else +#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_CONTROLS NUM_CONTROLS +#endif + #define OP_EVENT_MASK 0x0FFF #define OP_CTR_OVERFLOW (1ULL<<31) #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_COUNTERS]; +static unsigned long reset_value[NUM_VIRT_COUNTERS]; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +DECLARE_PER_CPU(int, switch_index); +#endif #ifdef CONFIG_OPROFILE_IBS @@ -82,6 +96,16 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) else msrs->controls[i].addr = 0; } + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = i % NUM_CONTROLS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +#endif } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -90,6 +114,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; + /* setup reset_value */ + for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + } else { + reset_value[i] = 0; + } + } + /* clear all counters */ for (i = 0; i < NUM_CONTROLS; ++i) { if (unlikely(!msrs->controls[i].addr)) @@ -108,20 +141,49 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (counter_config[i].enabled && msrs->counters[i].addr) { - reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, - -(u64)counter_config[i].count); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (counter_config[offset].enabled && msrs->counters[i].addr) { + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); + + /* setup control registers */ rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[i]); + val |= op_x86_get_ctrl(model, &counter_config[offset]); + wrmsrl(msrs->controls[i].addr, val); + } + } +} + + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (counter_config[offset].enabled) { + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[offset]); wrmsrl(msrs->controls[i].addr, val); - } else { - reset_value[i] = 0; } } } +#endif + + #ifdef CONFIG_OPROFILE_IBS static inline int @@ -230,14 +292,19 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, int i; for (i = 0; i < NUM_COUNTERS; ++i) { - 
if (!reset_value[i]) +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (!reset_value[offset]) continue; rdmsrl(msrs->counters[i].addr, val); /* bit is clear if overflowed: */ if (val & OP_CTR_OVERFLOW) continue; - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[i]); + oprofile_add_sample(regs, offset); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); } op_amd_handle_ibs(regs, msrs); @@ -250,8 +317,14 @@ static void op_amd_start(struct op_msrs const * const msrs) { u64 val; int i; + for (i = 0; i < NUM_COUNTERS; ++i) { - if (reset_value[i]) { +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (reset_value[offset]) { rdmsrl(msrs->controls[i].addr, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(msrs->controls[i].addr, val); @@ -271,7 +344,11 @@ static void op_amd_stop(struct op_msrs const * const msrs) * pm callback */ for (i = 0; i < NUM_COUNTERS; ++i) { +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) +#else if (!reset_value[i]) +#endif continue; rdmsrl(msrs->controls[i].addr, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -289,7 +366,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0; i < NUM_CONTROLS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -463,6 +540,8 @@ static void op_amd_exit(void) {} struct op_x86_model_spec const op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_virt_counters = NUM_VIRT_COUNTERS, + .num_virt_controls = NUM_VIRT_CONTROLS, .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, @@ -473,4 +552,7 @@ struct op_x86_model_spec const op_amd_spec = { .start = &op_amd_start, .stop = &op_amd_stop, .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + .switch_ctrl = &op_amd_switch_ctrl, +#endif }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 5921b7fc724b..65b9237cde8b 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,6 +698,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, + .num_virt_counters = NUM_COUNTERS_HT2, + .num_virt_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -710,6 +712,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, + .num_virt_counters = NUM_COUNTERS_NON_HT, + .num_virt_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 570d717c3308..098cbca5c0b0 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -206,6 +206,8 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = 2, .num_controls = 2, + .num_virt_counters = 2, + .num_virt_controls = 2, 
.reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 505489873b9d..0d07d23cb062 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -23,6 +23,7 @@ struct op_msr { struct op_msrs { struct op_msr *counters; struct op_msr *controls; + struct op_msr *multiplex; }; struct pt_regs; @@ -35,6 +36,8 @@ struct oprofile_operations; struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; + unsigned int num_virt_counters; + unsigned int num_virt_controls; u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); @@ -47,6 +50,10 @@ struct op_x86_model_spec { void (*start)(struct op_msrs const * const msrs); void (*stop)(struct op_msrs const * const msrs); void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif }; struct op_counter_config; diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 3cffce90f82a..7bc64af7cf99 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include "oprof.h" @@ -27,6 +29,15 @@ unsigned long oprofile_backtrace_depth; static unsigned long is_setup; static DEFINE_MUTEX(start_mutex); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void switch_worker(struct work_struct *work); +static DECLARE_DELAYED_WORK(switch_work, switch_worker); +unsigned long timeout_jiffies; +#define MULTIPLEXING_TIMER_DEFAULT 1 + +#endif + /* timer 0 - use performance monitoring hardware if available 1 - use the timer int mechanism regardless @@ -87,6 +98,20 @@ out: return err; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void start_switch_worker(void) +{ + schedule_delayed_work(&switch_work, timeout_jiffies); +} + +static void switch_worker(struct work_struct *work) +{ + if (!oprofile_ops.switch_events()) + start_switch_worker(); +} + +#endif /* Actually start profiling (echo 1>/dev/oprofile/enable) */ int oprofile_start(void) @@ -108,6 +133,11 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + if (oprofile_ops.switch_events) + start_switch_worker(); +#endif + oprofile_started = 1; out: mutex_unlock(&start_mutex); @@ -123,6 +153,11 @@ void oprofile_stop(void) goto out; oprofile_ops.stop(); oprofile_started = 0; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + cancel_delayed_work_sync(&switch_work); +#endif + /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); out: @@ -155,6 +190,36 @@ post_sync: mutex_unlock(&start_mutex); } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +/* User inputs in ms, converts to jiffies */ +int oprofile_set_timeout(unsigned long val_msec) +{ + int err = 0; + + mutex_lock(&start_mutex); + + if (oprofile_started) { + err = -EBUSY; + goto out; + } + + if (!oprofile_ops.switch_events) { + err = -EINVAL; + goto out; + } + + timeout_jiffies = msecs_to_jiffies(val_msec); + if (timeout_jiffies == MAX_JIFFY_OFFSET) + timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); + +out: + mutex_unlock(&start_mutex); + return err; + +} + +#endif int oprofile_set_backtrace(unsigned long val) { @@ -179,10 +244,23 @@ out: return err; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void __init oprofile_multiplexing_init(void) +{ + 
timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); +} + +#endif + static int __init oprofile_init(void) { int err; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofile_multiplexing_init(); +#endif + err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index c288d3c24b50..ee38abcc74f3 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -27,6 +27,7 @@ extern unsigned long oprofile_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long oprofile_backtrace_depth; +extern unsigned long timeout_jiffies; struct super_block; struct dentry; @@ -35,5 +36,6 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root); void oprofile_timer_init(struct oprofile_operations *ops); int oprofile_set_backtrace(unsigned long depth); +int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index 5d36ffc30dd5..468ec3e4f856 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -9,6 +9,7 @@ #include #include +#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -22,6 +23,45 @@ unsigned long oprofile_buffer_size; unsigned long oprofile_cpu_buffer_size; unsigned long oprofile_buffer_watershed; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static ssize_t timeout_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), + buf, count, offset); +} + + +static ssize_t timeout_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + retval = oprofile_set_timeout(val); + + if (retval) + return retval; + return count; +} + + +static const struct file_operations timeout_fops = { + .read = timeout_read, + .write = timeout_write, +}; + +#endif + + static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_ulong_to_user(oprofile_backtrace_depth, buf, count, @@ -139,6 +179,9 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofilefs_create_file(sb, root, "time_slice", &timeout_fops); +#endif oprofile_create_stats_files(sb, root); if (oprofile_ops.create_files) oprofile_ops.create_files(sb, root); diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index 3c2270a8300c..77a57a6792f6 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -16,6 +16,9 @@ #include "cpu_buffer.h" struct oprofile_stat_struct oprofile_stats; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +atomic_t multiplex_counter; +#endif void oprofile_reset_stats(void) { @@ -34,6 +37,9 @@ void oprofile_reset_stats(void) atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); atomic_set(&oprofile_stats.event_lost_overflow, 0); atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + atomic_set(&multiplex_counter, 0); +#endif } @@ -76,4 +82,8 @@ void 
oprofile_create_stats_files(struct super_block *sb, struct dentry *root) &oprofile_stats.event_lost_overflow); oprofilefs_create_ro_atomic(sb, dir, "bt_lost_no_mapping", &oprofile_stats.bt_lost_no_mapping); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter", + &multiplex_counter); +#endif } diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index d68d2ed94f15..5171639ecf0f 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -67,6 +67,9 @@ struct oprofile_operations { /* Initiate a stack backtrace. Optional. */ void (*backtrace)(struct pt_regs * const regs, unsigned int depth); + + /* Multiplex between different events. Optional. */ + int (*switch_events)(void); /* CPU identification string. */ char * cpu_type; }; -- cgit v1.2.3 From 5e766e3e433fa2d5d2fdfd8e2432804c91393387 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 8 Jul 2009 14:54:17 +0200 Subject: x86/oprofile: Fix usage of NUM_CONTROLS/NUM_COUNTERS macros Use the corresponding macros when iterating over counter and control registers. Since NUM_CONTROLS and NUM_COUNTERS are equal for AMD cpus, the fix is more of a cosmetic change. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index fdbed3a0c877..dcfd4505cacc 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -99,7 +99,7 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_CONTROLS; + int hw_counter = i % NUM_COUNTERS; if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; else @@ -366,7 +366,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } -- cgit v1.2.3 From 82a225283fb0d9438549595d9e6f3ecc42b42ad6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 16:29:34 +0200 Subject: x86/oprofile: Use per_cpu() instead of __get_cpu_var() __get_cpu_var() calls smp_processor_id(). When the cpu id is already known, instead use per_cpu() to avoid generating the id again. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e54f6a0b35ac..8cd4658370be 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -294,7 +294,7 @@ static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal.
That's by design: on @@ -307,7 +307,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - __get_cpu_var(switch_index) = 0; + per_cpu(switch_index, cpu) = 0; #endif } -- cgit v1.2.3 From 6bfccd099c2841e1c42530f1b6d2553bfa13be3a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 19:23:50 +0200 Subject: x86/oprofile: Fix initialization of switch_index Variable switch_index must be initialized for each cpu. This patch fixes the initialization by moving it to the per-cpu init function nmi_cpu_setup(). Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 8cd4658370be..b211d335e075 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -160,7 +160,7 @@ static int allocate_msrs(void) #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { int i; struct op_msr *multiplex = msrs->multiplex; @@ -173,8 +173,15 @@ static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) multiplex[i].saved = 0; } } + + per_cpu(switch_index, cpu) = 0; } +#else + +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } + #endif static void nmi_cpu_setup(void *dummy) @@ -184,9 +191,7 @@ static void nmi_cpu_setup(void *dummy) nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - nmi_setup_cpu_mux(msrs); -#endif + nmi_cpu_setup_mux(cpu, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -662,9 +667,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - __raw_get_cpu_var(switch_index) = 0; -#endif ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; -- cgit v1.2.3 From 2051cade7ccbe45a8bf8b7809d56b23d6d75ad03 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 15 Jul 2009 15:44:18 +0200 Subject: oprofile: oprofile_set_timeout(), return with error for invalid args Return with -EINVAL for invalid parameters instead of setting the default value in oprofile_set_timeout(). 
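
(A userspace model of the check being introduced, with approximate stand-ins for the kernel's msecs_to_jiffies() and MAX_JIFFY_OFFSET; the real function additionally holds start_mutex and refuses changes while profiling runs:)

#include <stdio.h>
#include <limits.h>

#define HZ 250				/* assumed tick rate */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1) - 1)

/* Rough model: out-of-range inputs saturate to MAX_JIFFY_OFFSET. */
static unsigned long msecs_to_jiffies_model(unsigned long msec)
{
	if (msec >= (ULONG_MAX - 999) / HZ)
		return MAX_JIFFY_OFFSET;
	return (msec * HZ + 999) / 1000;
}

/* The new behavior: reject the saturated value with -EINVAL
 * instead of silently falling back to the default time slice. */
static int set_timeout_model(unsigned long val_msec)
{
	unsigned long t = msecs_to_jiffies_model(val_msec);

	if (t == MAX_JIFFY_OFFSET)
		return -22;	/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("100 ms    -> %d\n", set_timeout_model(100));
	printf("ULONG_MAX -> %d\n", set_timeout_model(ULONG_MAX));
	return 0;
}
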
Signed-off-by: Robert Richter --- drivers/oprofile/oprof.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 7bc64af7cf99..42c9c765f9f1 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -196,6 +196,7 @@ post_sync: int oprofile_set_timeout(unsigned long val_msec) { int err = 0; + unsigned long time_slice; mutex_lock(&start_mutex); @@ -209,9 +210,13 @@ int oprofile_set_timeout(unsigned long val_msec) goto out; } - timeout_jiffies = msecs_to_jiffies(val_msec); - if (timeout_jiffies == MAX_JIFFY_OFFSET) - timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); + time_slice = msecs_to_jiffies(val_msec); + if (time_slice == MAX_JIFFY_OFFSET) { + err = -EINVAL; + goto out; + } + + timeout_jiffies = time_slice; out: mutex_unlock(&start_mutex); -- cgit v1.2.3 From afe1b50fe6aa56093e9234bdc08779e9fe20b5bf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 15 Jul 2009 15:19:29 +0200 Subject: oprofile: Rename variable timeout_jiffies and move to oprofile_files.c This patch renames timeout_jiffies into an oprofile specific name. The macro MULTIPLEXING_TIMER_DEFAULT is changed too. Also, since this variable is controlled using oprofilefs, its definition is moved to oprofile_files.c. Signed-off-by: Robert Richter --- drivers/oprofile/oprof.c | 9 ++++----- drivers/oprofile/oprof.h | 3 ++- drivers/oprofile/oprofile_files.c | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 42c9c765f9f1..2b33de716412 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -33,8 +33,7 @@ static DEFINE_MUTEX(start_mutex); static void switch_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(switch_work, switch_worker); -unsigned long timeout_jiffies; -#define MULTIPLEXING_TIMER_DEFAULT 1 +#define TIME_SLICE_DEFAULT 1 #endif @@ -102,7 +101,7 @@ out: static void start_switch_worker(void) { - schedule_delayed_work(&switch_work, timeout_jiffies); + schedule_delayed_work(&switch_work, oprofile_time_slice); } static void switch_worker(struct work_struct *work) @@ -216,7 +215,7 @@ int oprofile_set_timeout(unsigned long val_msec) goto out; } - timeout_jiffies = time_slice; + oprofile_time_slice = time_slice; out: mutex_unlock(&start_mutex); @@ -253,7 +252,7 @@ out: static void __init oprofile_multiplexing_init(void) { - timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); + oprofile_time_slice = msecs_to_jiffies(TIME_SLICE_DEFAULT); } #endif diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index ee38abcc74f3..cb92f5c98c1a 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -24,10 +24,11 @@ struct oprofile_operations; extern unsigned long oprofile_buffer_size; extern unsigned long oprofile_cpu_buffer_size; extern unsigned long oprofile_buffer_watershed; +extern unsigned long oprofile_time_slice; + extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long oprofile_backtrace_depth; -extern unsigned long timeout_jiffies; struct super_block; struct dentry; diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index 468ec3e4f856..4c5b94775844 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -22,14 +22,15 @@ unsigned long oprofile_buffer_size; unsigned long oprofile_cpu_buffer_size; unsigned long oprofile_buffer_watershed; +unsigned long 
oprofile_time_slice; #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static ssize_t timeout_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { - return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), - buf, count, offset); + return oprofilefs_ulong_to_user(jiffies_to_msecs(oprofile_time_slice), + buf, count, offset); } -- cgit v1.2.3 From 16422a6e2d16a39861ae5b0b11d9b411a245ab83 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 15 Jul 2009 15:44:18 +0200 Subject: oprofile: Remove oprofile_multiplexing_init() oprofile_multiplexing_init() can be removed when moving the initialization of oprofile_time_slice to oprofile_create_files(). Signed-off-by: Robert Richter --- drivers/oprofile/oprof.c | 14 -------------- drivers/oprofile/oprofile_files.c | 2 ++ 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 2b33de716412..fa6cccd87cf8 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -33,7 +33,6 @@ static DEFINE_MUTEX(start_mutex); static void switch_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(switch_work, switch_worker); -#define TIME_SLICE_DEFAULT 1 #endif @@ -248,23 +247,10 @@ out: return err; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void __init oprofile_multiplexing_init(void) -{ - oprofile_time_slice = msecs_to_jiffies(TIME_SLICE_DEFAULT); -} - -#endif - static int __init oprofile_init(void) { int err; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - oprofile_multiplexing_init(); -#endif - err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index 4c5b94775844..bbd7516e0869 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -18,6 +18,7 @@ #define BUFFER_SIZE_DEFAULT 131072 #define CPU_BUFFER_SIZE_DEFAULT 8192 #define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ +#define TIME_SLICE_DEFAULT 1 unsigned long oprofile_buffer_size; unsigned long oprofile_cpu_buffer_size; @@ -170,6 +171,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root) oprofile_buffer_size = BUFFER_SIZE_DEFAULT; oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT; oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT; + oprofile_time_slice = msecs_to_jiffies(TIME_SLICE_DEFAULT); oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); -- cgit v1.2.3 From a5659d17adb815fb35e11745e2f39c3f0bfd579f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 19 Jun 2009 16:45:34 +0200 Subject: oprofile: Grouping multiplexing code in oprof.c This patch moves multiplexing code to a single section of code. This reduces the use of #ifdefs especially within functions. 
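
(The #ifdef-reduction technique this and the following patches apply, shown in isolation: compile the real helpers only when the feature is enabled, and provide empty static inline stubs otherwise, so every call site stays unconditional. A sketch using the names from the patch; the comments paraphrase the real bodies:)

#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX

static void start_switch_worker(void)
{
	/* real version: schedule_delayed_work(&switch_work, ...); */
}

static void stop_switch_worker(void)
{
	/* real version: cancel_delayed_work_sync(&switch_work); */
}

#else

/* Stubs compile away entirely; callers need no #ifdef. */
static inline void start_switch_worker(void) { }
static inline void stop_switch_worker(void) { }

#endif

void example_caller(void)
{
	start_switch_worker();	/* valid in both configurations */
}
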
Signed-off-by: Robert Richter --- drivers/oprofile/oprof.c | 100 +++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 51 deletions(-) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index fa6cccd87cf8..a48294a8ebe8 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -29,13 +29,6 @@ unsigned long oprofile_backtrace_depth; static unsigned long is_setup; static DEFINE_MUTEX(start_mutex); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void switch_worker(struct work_struct *work); -static DECLARE_DELAYED_WORK(switch_work, switch_worker); - -#endif - /* timer 0 - use performance monitoring hardware if available 1 - use the timer int mechanism regardless @@ -98,9 +91,18 @@ out: #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +static void switch_worker(struct work_struct *work); +static DECLARE_DELAYED_WORK(switch_work, switch_worker); + static void start_switch_worker(void) { - schedule_delayed_work(&switch_work, oprofile_time_slice); + if (oprofile_ops.switch_events) + schedule_delayed_work(&switch_work, oprofile_time_slice); +} + +static void stop_switch_worker(void) +{ + cancel_delayed_work_sync(&switch_work); } static void switch_worker(struct work_struct *work) @@ -109,6 +111,43 @@ static void switch_worker(struct work_struct *work) start_switch_worker(); } +/* User inputs in ms, converts to jiffies */ +int oprofile_set_timeout(unsigned long val_msec) +{ + int err = 0; + unsigned long time_slice; + + mutex_lock(&start_mutex); + + if (oprofile_started) { + err = -EBUSY; + goto out; + } + + if (!oprofile_ops.switch_events) { + err = -EINVAL; + goto out; + } + + time_slice = msecs_to_jiffies(val_msec); + if (time_slice == MAX_JIFFY_OFFSET) { + err = -EINVAL; + goto out; + } + + oprofile_time_slice = time_slice; + +out: + mutex_unlock(&start_mutex); + return err; + +} + +#else + +static inline void start_switch_worker(void) { } +static inline void stop_switch_worker(void) { } + #endif /* Actually start profiling (echo 1>/dev/oprofile/enable) */ @@ -131,10 +170,7 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - if (oprofile_ops.switch_events) - start_switch_worker(); -#endif + start_switch_worker(); oprofile_started = 1; out: @@ -152,9 +188,7 @@ void oprofile_stop(void) oprofile_ops.stop(); oprofile_started = 0; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - cancel_delayed_work_sync(&switch_work); -#endif + stop_switch_worker(); /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); @@ -188,42 +222,6 @@ post_sync: mutex_unlock(&start_mutex); } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -/* User inputs in ms, converts to jiffies */ -int oprofile_set_timeout(unsigned long val_msec) -{ - int err = 0; - unsigned long time_slice; - - mutex_lock(&start_mutex); - - if (oprofile_started) { - err = -EBUSY; - goto out; - } - - if (!oprofile_ops.switch_events) { - err = -EINVAL; - goto out; - } - - time_slice = msecs_to_jiffies(val_msec); - if (time_slice == MAX_JIFFY_OFFSET) { - err = -EINVAL; - goto out; - } - - oprofile_time_slice = time_slice; - -out: - mutex_unlock(&start_mutex); - return err; - -} - -#endif - int oprofile_set_backtrace(unsigned long val) { int err = 0; -- cgit v1.2.3 From d8471ad3ab613a1ba7abd3aad46659de39a2871c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:04:43 +0200 Subject: oprofile: Introduce op_x86_phys_to_virt() This new function translates physical to virtual counter numbers. 
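
(In effect the translation only offsets a physical counter slot by the CPU's current switch index; a two-line model, with the per-CPU handling elided:)

static int switch_index;	/* per-CPU in the real code */

/* Map a physical slot (0..num_counters-1) to the virtual counter it
 * currently stands in for; with multiplexing disabled this collapses
 * to the identity function, as the #else branch below shows. */
static int phys_to_virt_model(int phys)
{
	return switch_index + phys;
}
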
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 43 +++++++++++---------- arch/x86/oprofile/op_model_amd.c | 80 ++++++++++++++++------------------------ arch/x86/oprofile/op_x86_model.h | 1 + 3 files changed, 55 insertions(+), 69 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b211d335e075..02b57b8d0e61 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -27,12 +27,6 @@ #include "op_counter.h" #include "op_x86_model.h" - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -DEFINE_PER_CPU(int, switch_index); -#endif - - static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -103,6 +97,21 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +inline int op_x86_phys_to_virt(int phys) +{ + return __get_cpu_var(switch_index) + phys; +} + +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } + +#endif + static void free_msrs(void) { int i; @@ -248,31 +257,25 @@ static int nmi_setup(void) static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { - unsigned int si = __get_cpu_var(switch_index); struct op_msr *multiplex = msrs->multiplex; - unsigned int i; + int i; for (i = 0; i < model->num_counters; ++i) { - int offset = i + si; - if (multiplex[offset].addr) { - rdmsrl(multiplex[offset].addr, - multiplex[offset].saved); - } + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { - unsigned int si = __get_cpu_var(switch_index); struct op_msr *multiplex = msrs->multiplex; - unsigned int i; + int i; for (i = 0; i < model->num_counters; ++i) { - int offset = i + si; - if (multiplex[offset].addr) { - wrmsrl(multiplex[offset].addr, - multiplex[offset].saved); - } + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); } } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index dcfd4505cacc..67f830d12e0e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -42,9 +42,6 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) static unsigned long reset_value[NUM_VIRT_COUNTERS]; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -DECLARE_PER_CPU(int, switch_index); -#endif #ifdef CONFIG_OPROFILE_IBS @@ -141,21 +138,20 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (counter_config[offset].enabled && msrs->counters[i].addr) { - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); - - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[offset]); - wrmsrl(msrs->controls[i].addr, val); - } + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + if (!msrs->counters[i].addr) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, 
&counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } @@ -170,14 +166,13 @@ static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (counter_config[offset].enabled) { - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[offset]); - wrmsrl(msrs->controls[i].addr, val); - } + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } @@ -292,19 +287,15 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, int i; for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (!reset_value[offset]) + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) continue; rdmsrl(msrs->counters[i].addr, val); /* bit is clear if overflowed: */ if (val & OP_CTR_OVERFLOW) continue; - oprofile_add_sample(regs, offset); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); } op_amd_handle_ibs(regs, msrs); @@ -319,16 +310,11 @@ static void op_amd_start(struct op_msrs const * const msrs) int i; for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (reset_value[offset]) { - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } op_amd_start_ibs(); @@ -344,11 +330,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * pm callback */ for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) -#else - if (!reset_value[i]) -#endif + if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0d07d23cb062..e874dc3565ae 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -60,6 +60,7 @@ struct op_counter_config; extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); +extern int op_x86_phys_to_virt(int phys); extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; -- cgit v1.2.3 From 7e7478c6bc0e011d2854b21f190cc3a1dba89905 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:09:53 +0200 Subject: oprofile: Grouping multiplexing code in op_model_amd.c This patch moves some multiplexing code to the new function op_mux_fill_in_addresses(). Also, the whole multiplexing code is now at a single location. 
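
(The core of op_mux_switch_ctrl(), which this patch moves, is a read-modify-write of each event-select register; modeled below with an array standing in for rdmsrl()/wrmsrl(), and the reserved mask taken from MSR_AMD_EVENTSEL_RESERVED in this file:)

#include <stdint.h>

#define NUM_COUNTERS 4

static uint64_t evntsel[NUM_COUNTERS];	/* stand-in for the MSRs */
static const uint64_t reserved_mask =
	(0xFFFFFCF0ULL << 32) | (1ULL << 21);

static void switch_ctrl_model(const uint64_t *new_ctrl)
{
	int i;

	for (i = 0; i < NUM_COUNTERS; i++) {
		uint64_t val = evntsel[i];	/* rdmsrl()           */
		val &= reserved_mask;		/* keep reserved bits */
		val |= new_ctrl[i];		/* install next event */
		evntsel[i] = val;		/* wrmsrl()           */
	}
}
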
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 75 ++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 67f830d12e0e..644980f03924 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -74,6 +74,45 @@ static struct op_ibs_config ibs_config; #endif +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = i % NUM_COUNTERS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +} + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#else + +static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } + +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -94,15 +133,7 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) msrs->controls[i].addr = 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_COUNTERS; - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } -#endif + op_mux_fill_in_addresses(msrs); } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -155,30 +186,6 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#endif - - #ifdef CONFIG_OPROFILE_IBS static inline int @@ -535,6 +542,6 @@ struct op_x86_model_spec const op_amd_spec = { .stop = &op_amd_stop, .shutdown = &op_amd_shutdown, #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - .switch_ctrl = &op_amd_switch_ctrl, + .switch_ctrl = &op_mux_switch_ctrl, #endif }; -- cgit v1.2.3 From 6ab82f958a5dca591a6ea17a3ca6f2aca06f4f2f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:40:04 +0200 Subject: x86/oprofile: Implement multiplexing setup/shutdown functions This patch implements nmi_setup_mux() and nmi_shutdown_mux() functions to setup/shutdown multiplexing. Multiplexing code in nmi_int.c is now much more separated. 
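
(Why the combined error path in nmi_setup() below is safe: the teardown helpers tolerate a partially completed setup, since kfree(NULL) is a no-op. Condensed from the hunks that follow, not standalone code:)

/* Allocate in stages, but clean up through one path: each free
 * routine copes with slots that were never allocated. */
static int nmi_setup_model(void)
{
	int err;

	if (!allocate_msrs())
		err = -ENOMEM;
	else if (!nmi_setup_mux())
		err = -ENOMEM;
	else
		err = register_die_notifier(&profile_exceptions_nb);

	if (err) {
		free_msrs();		/* kfree(NULL) is harmless */
		nmi_shutdown_mux();
		return err;
	}
	return 0;
}
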
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 76 ++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 02b57b8d0e61..674fa37d1502 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -106,9 +106,35 @@ inline int op_x86_phys_to_virt(int phys) return __get_cpu_var(switch_index) + phys; } +static void nmi_shutdown_mux(void) +{ + int i; + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; + } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + return 1; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } #endif @@ -120,51 +146,27 @@ static void free_msrs(void) per_cpu(cpu_msrs, i).counters = NULL; kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - kfree(per_cpu(cpu_msrs, i).multiplex); - per_cpu(cpu_msrs, i).multiplex = NULL; -#endif } } static int allocate_msrs(void) { - int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; -#endif int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + return 0; per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) { - success = 0; - break; - } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).multiplex) { - success = 0; - break; - } -#endif + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + return 0; } - if (!success) - free_msrs(); - - return success; + return 1; } #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX @@ -218,11 +220,15 @@ static int nmi_setup(void) int cpu; if (!allocate_msrs()) - return -ENOMEM; + err = -ENOMEM; + else if (!nmi_setup_mux()) + err = -ENOMEM; + else + err = register_die_notifier(&profile_exceptions_nb); - err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + nmi_shutdown_mux(); return err; } @@ -314,9 +320,6 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - per_cpu(switch_index, cpu) = 0; -#endif } static void nmi_shutdown(void) @@ -326,6 +329,7 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + nmi_shutdown_mux(); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); -- cgit v1.2.3 From 48fb4b46712c7d3e8adc79826311abd9ccbf7f1d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_setup_cpu_mux() in nmi_int.c This 
patch moves some code in nmi_int.c to get a single separate multiplexing code section. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 674fa37d1502..b1edfc922e7f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -130,11 +130,30 @@ static int nmi_setup_mux(void) return 1; } +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; + } + } + + per_cpu(switch_index, cpu) = 0; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } #endif @@ -169,32 +188,6 @@ static int allocate_msrs(void) return 1; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) -{ - int i; - struct op_msr *multiplex = msrs->multiplex; - - for (i = 0; i < model->num_virt_counters; ++i) { - if (counter_config[i].enabled) { - multiplex[i].saved = -(u64)counter_config[i].count; - } else { - multiplex[i].addr = 0; - multiplex[i].saved = 0; - } - } - - per_cpu(switch_index, cpu) = 0; -} - -#else - -static inline void -nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } - -#endif - static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); -- cgit v1.2.3 From d0f585dd20010f8479e56b5c6f391ef18e26877e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_cpu_save/restore_mpx_registers() in nmi_int.c This patch moves some code in nmi_int.c to get a single separate multiplexing code section. 
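
(What the moved pair does, reduced to arrays: park the outgoing event set's counts, then reload the incoming set's parked counts into the same hardware slots. A standalone model, simplified from the code; the addr == 0 skip for unused slots is omitted:)

#include <stdio.h>
#include <stdint.h>

#define NUM_COUNTERS      4
#define NUM_VIRT_COUNTERS 32

static uint64_t hw_ctr[NUM_COUNTERS];		/* the real counters  */
static uint64_t parked[NUM_VIRT_COUNTERS];	/* .saved per virtual */

static void switch_sets_model(int old_si, int new_si)
{
	int i;

	for (i = 0; i < NUM_COUNTERS; i++)	/* save_mpx_registers */
		parked[old_si + i] = hw_ctr[i];
	for (i = 0; i < NUM_COUNTERS; i++)	/* restore_mpx_...    */
		hw_ctr[i] = parked[new_si + i];
}

int main(void)
{
	hw_ctr[0] = 123;
	switch_sets_model(0, 4);	/* events 0..3 -> 4..7 */
	switch_sets_model(4, 0);	/* and back            */
	printf("%llu\n", (unsigned long long)hw_ctr[0]);	/* 123 */
	return 0;
}
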
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 52 +++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b1edfc922e7f..f38c5cf0fdbb 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -147,6 +147,30 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) per_cpu(switch_index, cpu) = 0; } +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -252,34 +276,6 @@ static int nmi_setup(void) return 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); - } -} - -#endif - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; -- cgit v1.2.3 From b28d1b923ab52d535c0719155dccf3b3d98bab9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_cpu_switch() in nmi_int.c This patch moves some code in nmi_int.c to get a single separate multiplexing code section. 
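The core of nmi_cpu_switch() is the advance of the per-CPU switch_index by num_counters, wrapping back to set 0 when the next set would run past num_virt_counters or hit an unused counter. A bounds-safe user-space sketch of just that index arithmetic (names and array contents are invented; the kernel indexes counter_config[] and uses a '>' comparison):

#include <stdio.h>

#define NUM_COUNTERS      2
#define NUM_VIRT_COUNTERS 6

/* count == 0 marks an unused virtual counter, as in counter_config[] */
static unsigned long count[NUM_VIRT_COUNTERS] = { 500, 500, 500, 500, 0, 0 };

/* Simplified, bounds-safe variant of the rotation in nmi_cpu_switch(). */
static int next_set(int si)
{
        si += NUM_COUNTERS;
        if (si >= NUM_VIRT_COUNTERS || count[si] == 0)
                si = 0;
        return si;
}

int main(void)
{
        int si = 0;
        for (int round = 0; round < 5; round++) {
                printf("round %d uses virtual counters %d..%d\n",
                       round, si, si + NUM_COUNTERS - 1);
                si = next_set(si);
        }
        return 0;
}

With the data above the rotation cycles between sets 0..1 and 2..3, since virtual counter 4 is unused.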
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 144 +++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 74 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f38c5cf0fdbb..998c7dca31e7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -97,6 +97,29 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->start(msrs); +} + +static int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 1); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->stop(msrs); +} + +static void nmi_stop(void) +{ + on_each_cpu(nmi_cpu_stop, NULL, 1); +} + #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static DEFINE_PER_CPU(int, switch_index); @@ -171,6 +194,53 @@ static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) } } +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!model->switch_ctrl) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + atomic_inc(&multiplex_counter); + + return 0; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -325,29 +395,6 @@ static void nmi_shutdown(void) put_cpu_var(cpu_msrs); } -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); -} - -static int nmi_start(void) -{ - on_each_cpu(nmi_cpu_start, NULL, 1); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); -} - -static void nmi_stop(void) -{ - on_each_cpu(nmi_cpu_stop, NULL, 1); -} - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; @@ -379,57 +426,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, cpu) = 0; - else - per_cpu(switch_index, cpu) = si; - - model->switch_ctrl(model, msrs); - nmi_cpu_restore_mpx_registers(msrs); - - nmi_cpu_start(NULL); -} - - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. 
- */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (!model->switch_ctrl) - return -ENOSYS; /* not implemented */ - if (nmi_multiplex_on() < 0) - return -EINVAL; /* not necessary */ - - on_each_cpu(nmi_cpu_switch, NULL, 1); - - atomic_inc(&multiplex_counter); - - return 0; -} - -#endif - #ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) -- cgit v1.2.3 From 259a83a8abdb9d2664819ec80ad12ebaeb251e32 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 15:12:35 +0200 Subject: x86/oprofile: Remove const qualifier from struct op_x86_model_spec This patch removes the const qualifier from struct op_x86_model_spec to make model parameters changeable. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 ++-- arch/x86/oprofile/op_model_amd.c | 2 +- arch/x86/oprofile/op_model_p4.c | 4 ++-- arch/x86/oprofile/op_model_ppro.c | 2 +- arch/x86/oprofile/op_x86_model.h | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 998c7dca31e7..826f391b4229 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -27,7 +27,7 @@ #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const *model; +static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -542,7 +542,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - struct op_x86_model_spec const *spec = &op_ppro_spec; /* default */ + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 644980f03924..39604b429d69 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -526,7 +526,7 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ -struct op_x86_model_spec const op_amd_spec = { +struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .num_virt_counters = NUM_VIRT_COUNTERS, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 65b9237cde8b..40df028d0d91 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -695,7 +695,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) #ifdef CONFIG_SMP -struct op_x86_model_spec const op_p4_ht2_spec = { +struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .num_virt_counters = NUM_COUNTERS_HT2, @@ -709,7 +709,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = { }; #endif -struct op_x86_model_spec const op_p4_spec = { +struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .num_virt_counters = NUM_COUNTERS_NON_HT, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 098cbca5c0b0..659f3b6f86fb 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -203,7 +203,7 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec const op_ppro_spec = { +struct op_x86_model_spec op_ppro_spec = {
.num_counters = 2, .num_controls = 2, .num_virt_counters = 2, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index e874dc3565ae..0c886fa0369c 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -62,10 +62,10 @@ extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); -extern struct op_x86_model_spec const op_ppro_spec; -extern struct op_x86_model_spec const op_p4_spec; -extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec op_p4_spec; +extern struct op_x86_model_spec op_p4_ht2_spec; +extern struct op_x86_model_spec op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3 From 2904a527575344a804fdd82b1f8d09a8731d8d49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 12:33:41 +0200 Subject: x86/oprofile: Remove unused num_virt_controls from struct op_x86_model_spec The member num_virt_controls of struct op_x86_model_spec is not used. This patch removes it. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 1 - arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 1 - arch/x86/oprofile/op_x86_model.h | 1 - 4 files changed, 5 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 39604b429d69..dce69b5979e6 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -530,7 +530,6 @@ struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .num_virt_counters = NUM_VIRT_COUNTERS, - .num_virt_controls = NUM_VIRT_CONTROLS, .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 40df028d0d91..0a4f2deb9e8f 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -699,7 +699,6 @@ struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .num_virt_counters = NUM_COUNTERS_HT2, - .num_virt_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -713,7 +712,6 @@ struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .num_virt_counters = NUM_COUNTERS_NON_HT, - .num_virt_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 659f3b6f86fb..753a02ab215b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -207,7 +207,6 @@ struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, .num_virt_counters = 2, - .num_virt_controls = 2, .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0c886fa0369c..4e2e7c2c519f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -37,7 +37,6 @@ struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; 
unsigned int num_virt_counters; - unsigned int num_virt_controls; u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); -- cgit v1.2.3 From 52471c67ee2fa5ed6f700ef57bf27833c63b2192 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 6 Jul 2009 14:43:55 +0200 Subject: x86/oprofile: Modify initialization of num_virt_counters Models that do not yet support counter multiplexing have to set up num_virt_counters. This patch implements the setup from num_counters if num_virt_counters is not set. Thus, num_virt_counters must be set up only for multiplexing support. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 +++ arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 826f391b4229..82ee29517f16 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -674,6 +674,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (ret) return ret; + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 0a4f2deb9e8f..ac6b354becdf 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,7 +698,6 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, - .num_virt_counters = NUM_COUNTERS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -711,7 +710,6 @@ struct op_x86_model_spec op_p4_ht2_spec = { struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, - .num_virt_counters = NUM_COUNTERS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 753a02ab215b..4899215999de 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -206,7 +206,6 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, - .num_virt_counters = 2, .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, -- cgit v1.2.3 From 39e97f40c3a5e71de0532368deaa683e09b74ba2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 15:11:45 +0200 Subject: x86/oprofile: Add function has_mux() to check multiplexing support The check is used to prevent running multiplexing code for models not supporting multiplexing. Before, the code was running but without effect.
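The test itself is simply whether the model filled in a switch_ctrl callback. A minimal sketch of this capability-by-function-pointer pattern, with an invented stand-in struct rather than the real op_x86_model_spec:

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-in for struct op_x86_model_spec. */
struct model_spec {
        const char *name;
        int (*switch_ctrl)(void);  /* NULL => no multiplexing support */
};

static int amd_switch_ctrl(void) { return 0; }

static struct model_spec amd  = { "amd",  amd_switch_ctrl };
static struct model_spec ppro = { "ppro", NULL };

static int has_mux(const struct model_spec *m)
{
        return m->switch_ctrl != NULL;  /* same test as the kernel helper */
}

int main(void)
{
        printf("%s: mux %s\n", amd.name,  has_mux(&amd)  ? "on" : "off");
        printf("%s: mux %s\n", ppro.name, has_mux(&ppro) ? "on" : "off");
        return 0;
}

Leaving the pointer NULL is the model's way of opting out, which is cheaper and less error-prone than carrying a separate feature flag.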
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 82ee29517f16..dca7240aeb26 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -124,6 +124,11 @@ static void nmi_stop(void) static DEFINE_PER_CPU(int, switch_index); +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + inline int op_x86_phys_to_virt(int phys) { return __get_cpu_var(switch_index) + phys; @@ -132,6 +137,10 @@ inline int op_x86_phys_to_virt(int phys) static void nmi_shutdown_mux(void) { int i; + + if (!has_mux()) + return; + for_each_possible_cpu(i) { kfree(per_cpu(cpu_msrs, i).multiplex); per_cpu(cpu_msrs, i).multiplex = NULL; @@ -144,12 +153,17 @@ static int nmi_setup_mux(void) size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; int i; + + if (!has_mux()) + return 1; + for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = kmalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } + return 1; } @@ -158,6 +172,9 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) int i; struct op_msr *multiplex = msrs->multiplex; + if (!has_mux()) + return; + for (i = 0; i < model->num_virt_counters; ++i) { if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; @@ -229,7 +246,7 @@ static int nmi_multiplex_on(void) static int nmi_switch_event(void) { - if (!model->switch_ctrl) + if (!has_mux()) return -ENOSYS; /* not implemented */ if (nmi_multiplex_on() < 0) return -EINVAL; /* not necessary */ -- cgit v1.2.3 From 5280514471c2803776701c43c027038decac1103 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 16:02:44 +0200 Subject: x86/oprofile: Enable multiplexing only if the model supports it This patch checks if the model supports multiplexing. Only then multiplexing will be enabled. The code is added to the common x86 initialization. 
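In other words, the ops->switch_events hook is published only when has_mux() says the model can multiplex, and the core treats an unset hook as "no multiplexing". A rough sketch of that contract (struct and names are illustrative assumptions, not the real oprofile interfaces):

#include <stdio.h>
#include <stddef.h>

struct demo_ops {
        int (*switch_events)(void);  /* left NULL when the model can't multiplex */
};

static int nmi_switch_event_demo(void) { puts("switching counter sets"); return 0; }

/* Like mux_init(): publish the hook only when the model supports it. */
static void mux_init_demo(struct demo_ops *ops, int model_has_mux)
{
        if (model_has_mux)
                ops->switch_events = nmi_switch_event_demo;
}

int main(void)
{
        struct demo_ops ops = { NULL };
        mux_init_demo(&ops, 1);
        if (ops.switch_events)   /* the core only calls a published hook */
                ops.switch_events();
        return 0;
}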
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index dca7240aeb26..f0fb44725d80 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -258,6 +258,12 @@ static int nmi_switch_event(void) return 0; } +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -265,6 +271,7 @@ static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } static inline void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } #endif @@ -682,9 +689,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - ops->switch_events = nmi_switch_event; -#endif if (model->init) ret = model->init(ops); @@ -694,6 +698,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!model->num_virt_counters) model->num_virt_counters = model->num_counters; + mux_init(ops); + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); -- cgit v1.2.3 From 4d015f79e972cea1761cfee8872b1c0992ccd8b2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 21:42:51 +0200 Subject: x86/oprofile: Implement mux_clone() To set up a counter for all cpus, its structure is cloned from cpu 0. This patch implements mux_clone() to do this part for multiplexing data. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f0fb44725d80..da6d2ab31c6c 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -264,6 +264,16 @@ static inline void mux_init(struct oprofile_operations *ops) ops->switch_events = nmi_switch_event; } +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -272,6 +282,7 @@ static inline int nmi_setup_mux(void) { return 1; } static inline void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } #endif @@ -350,20 +361,18 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); for_each_possible_cpu(cpu) { - if (cpu != 0) { - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - memcpy(per_cpu(cpu_msrs, cpu).multiplex, - per_cpu(cpu_msrs, 0).multiplex, - sizeof(struct op_msr) * model->num_virt_counters); -#endif - } + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs,
0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); } on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; -- cgit v1.2.3 From 1b294f5960cd89e49eeb3e797860c552b03f2272 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:56:25 +0200 Subject: oprofile: Adding switch counter to oprofile statistic variables This patch moves the multiplexing switch counter from x86 code to common oprofile statistic variables. Now the value will be available and usable for all architectures. The initialization and incrementation were also moved to common code. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 7 ------- drivers/oprofile/oprof.c | 7 +++++-- drivers/oprofile/oprofile_stats.c | 9 ++------- drivers/oprofile/oprofile_stats.h | 1 + 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index da6d2ab31c6c..7b3362f9abdb 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -34,11 +34,6 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -extern atomic_t multiplex_counter; -#endif - struct op_counter_config counter_config[OP_MAX_COUNTER]; /* common functions */ @@ -253,8 +248,6 @@ static int nmi_switch_event(void) on_each_cpu(nmi_cpu_switch, NULL, 1); - atomic_inc(&multiplex_counter); - return 0; } diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index a48294a8ebe8..dc8a0428260d 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -107,8 +107,11 @@ static void stop_switch_worker(void) static void switch_worker(struct work_struct *work) { - if (!oprofile_ops.switch_events()) - start_switch_worker(); + if (oprofile_ops.switch_events()) + return; + + atomic_inc(&oprofile_stats.multiplex_counter); + start_switch_worker(); } /* User inputs in ms, converts to jiffies */ diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index 77a57a6792f6..61689e814d46 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -16,9 +16,6 @@ #include "cpu_buffer.h" struct oprofile_stat_struct oprofile_stats; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -atomic_t multiplex_counter; -#endif void oprofile_reset_stats(void) { @@ -37,9 +34,7 @@ void oprofile_reset_stats(void) atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); atomic_set(&oprofile_stats.event_lost_overflow, 0); atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - atomic_set(&multiplex_counter, 0); -#endif + atomic_set(&oprofile_stats.multiplex_counter, 0); } @@ -84,6 +79,6 @@ void oprofile_create_stats_files(struct super_block *sb, struct dentry *root) &oprofile_stats.bt_lost_no_mapping); #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter", - &multiplex_counter); + &oprofile_stats.multiplex_counter); #endif } diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h index 3da0d08dc1f9..0b54e46c3c14 100644 --- a/drivers/oprofile/oprofile_stats.h +++ b/drivers/oprofile/oprofile_stats.h @@ -17,6 +17,7 @@ struct oprofile_stat_struct { atomic_t sample_lost_no_mapping; atomic_t bt_lost_no_mapping; atomic_t event_lost_overflow; + atomic_t multiplex_counter; }; extern struct oprofile_stat_struct oprofile_stats; -- cgit v1.2.3 From 61d149d5248ad7428801cdede0f5fcc2b90cd61c Mon Sep 17 00:00:00 2001 From:
Robert Richter Date: Fri, 10 Jul 2009 15:47:17 +0200 Subject: x86/oprofile: Implement op_x86_virt_to_phys() This patch implements a common x86 function to convert virtual counter numbers to physical. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 6 ++++++ arch/x86/oprofile/op_model_amd.c | 2 +- arch/x86/oprofile/op_x86_model.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7b3362f9abdb..5856e61cb098 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -129,6 +129,11 @@ inline int op_x86_phys_to_virt(int phys) return __get_cpu_var(switch_index) + phys; } +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + static void nmi_shutdown_mux(void) { int i; @@ -270,6 +275,7 @@ static void mux_clone(int cpu) #else inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } static inline void diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index dce69b5979e6..1ea19829d985 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -81,7 +81,7 @@ static void op_mux_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_COUNTERS; + int hw_counter = op_x86_virt_to_phys(i); if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; else diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 4e2e7c2c519f..b83776180c7f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -60,6 +60,7 @@ struct op_counter_config; extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); +extern int op_x86_virt_to_phys(int virt); extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec op_p4_spec; -- cgit v1.2.3 From 11be1a7b54283021777f409aa983ce125945e67c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 10 Jul 2009 18:15:21 +0200 Subject: x86/oprofile: Add counter reservation check for virtual counters This patch adds a check for the availability of a counter. A virtual counter is used only if its physical counter is not reserved. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 5856e61cb098..cb88b1a0bd5f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -435,15 +435,13 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) struct dentry *dir; char buf[4]; -#ifndef CONFIG_OPROFILE_EVENT_MULTIPLEX /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). 
*/ - if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) continue; -#endif /* CONFIG_OPROFILE_EVENT_MULTIPLEX */ snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); -- cgit v1.2.3 From c550091edd6fac2ed9dac1b30d986b6c58b216fa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:11:16 +0200 Subject: x86/oprofile: Small coding style fixes Some small coding style fixes. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 1ea19829d985..827beecb67a4 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -144,11 +144,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* setup reset_value */ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) { + if (counter_config[i].enabled) reset_value[i] = counter_config[i].count; - } else { + else reset_value[i] = 0; - } } /* clear all counters */ -- cgit v1.2.3 From 68fd60a8c8bca6af51805c45f286f0f2572ac977 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 16 Jul 2009 10:53:34 +0800 Subject: tracing/events: add missing type info of dynamic arrays The format file doesn't contain enough information for __dynamic_array/__string. The type name is missing. Before: # cat format: name: irq_handler_entry ... field:__data_loc name; offset:16; size:2; After: # cat format: name: irq_handler_entry ... field:__data_loc char[] name; offset:16; size:2; Signed-off-by: Lai Jiangshan LKML-Reference: <4A5E962E.9020900@cn.fujitsu.com> Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e5..cc78943e0038 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -120,7 +120,7 @@ #undef __dynamic_array #define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ __data_loc_##item), \ @@ -279,7 +279,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef __dynamic_array #define __dynamic_array(type, item, len) \ - ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ + ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \ offsetof(typeof(field), __data_loc_##item), \ sizeof(field.__data_loc_##item), 0); -- cgit v1.2.3 From 7d536cb3fb9993bdcd5a2fbaa6b0670ded4e101c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 16 Jul 2009 10:54:02 +0800 Subject: tracing/events: record the size of dynamic arrays When a dynamic array is defined, we add __data_loc_foo in trace_entry to record the offset of the array, but the size of the array is not recorded, which causes 2 problems: - the event filter just compares the first 2 chars of the strings. - parsers can't parse dynamic arrays. So we encode the size of each dynamic array in the higher 16 bits of __data_loc_foo, while the offset is in lower 16 bits. 
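The encoding is ordinary bit packing on a 32-bit value. A self-contained sketch of the encode and decode steps, with made-up field values (the masks mirror the ones the patch uses):

#include <stdio.h>
#include <stdint.h>

/* Pack offset (low 16 bits) and length (high 16 bits) into one u32,
 * as __dynamic_array() does for __data_loc_<item>. */
static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
        return ((uint32_t)len << 16) | offset;
}

int main(void)
{
        uint32_t data_loc = pack_data_loc(16, 12); /* e.g. 12-byte string at offset 16 */
        uint16_t offset = data_loc & 0xffff;       /* as in __get_dynamic_array() */
        uint16_t len    = data_loc >> 16;          /* as in filter_pred_strloc() */

        printf("data_loc=0x%08x offset=%u len=%u\n", data_loc, offset, len);
        return 0;
}

Packing both fields into the existing u32 keeps the trace entry layout unchanged while giving parsers and the filter the length they previously had to guess.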
Signed-off-by: Li Zefan LKML-Reference: <4A5E964A.9000403@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 14 ++++++++------ kernel/trace/trace_events_filter.c | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index cc78943e0038..3cbb96ef34f4 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -25,7 +25,7 @@ #define __array(type, item, len) type item[len]; #undef __dynamic_array -#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; +#define __dynamic_array(type, item, len) u32 __data_loc_##item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -51,13 +51,14 @@ * Include the following: * * struct ftrace_data_offsets_ { - * int ; - * int ; + * u32 ; + * u32 ; * [...] * }; * - * The __dynamic_array() macro will create each int , this is + * The __dynamic_array() macro will create each u32 , this is * to keep the offset of each array from the beginning of the event. + * The size of an array is also encoded, in the higher 16 bits of . */ #undef __field @@ -67,7 +68,7 @@ #define __array(type, item, len) #undef __dynamic_array -#define __dynamic_array(type, item, len) int item; +#define __dynamic_array(type, item, len) u32 item; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -207,7 +208,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __get_dynamic_array #define __get_dynamic_array(field) \ - ((void *)__entry + __entry->__data_loc_##field) + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) @@ -325,6 +326,7 @@ ftrace_define_fields_##call(void) \ #define __dynamic_array(type, item, len) \ __data_offsets->item = __data_size + \ offsetof(typeof(*entry), __data); \ + __data_offsets->item |= (len * sizeof(type)) << 16; \ __data_size += (len) * sizeof(type); #undef __string diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b9aae72d13db..1c80ef702b83 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - unsigned short str_loc = *(unsigned short *)(event + pred->offset); + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = strncmp(addr, pred->str_val, str_len); match = (!cmp) ^ pred->not; -- cgit v1.2.3 From ff4e9da2330beb8d64498a513d3f9694e941b01a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 22 Jun 2009 10:33:07 +0800 Subject: tracing: cleanup for tracing_trace_options_read() '\n' is already appended, and what we need is just an extra space for the '\0'. 
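The arithmetic can be modeled in a few lines of user-space C: every option line already budgets for its own '\n' (plus the extra "no" prefix when disabled), so only the terminating '\0' needs one more byte. A rough demo under those assumptions, with invented option names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const char *opts[] = { "print-parent", "sym-offset" };
        int enabled[]      = { 1, 0 };
        size_t len = 0, n = sizeof(opts) / sizeof(opts[0]);

        /* first pass: strlen(name) + 1 for '\n', plus 2 more for "no" if disabled */
        for (size_t i = 0; i < n; i++)
                len += strlen(opts[i]) + (enabled[i] ? 1 : 3);

        char *buf = malloc(len + 1);  /* +1 for the trailing '\0' only */
        if (!buf)
                return 1;

        size_t r = 0;
        for (size_t i = 0; i < n; i++)
                r += snprintf(buf + r, len + 1 - r, "%s%s\n",
                              enabled[i] ? "" : "no", opts[i]);

        fputs(buf, stdout);
        free(buf);
        return 0;
}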
Signed-off-by: Xiao Guangrong LKML-Reference: <4A3EED63.3090908@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e30e6b1dbd4e..38a4a3ee749d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2256,8 +2256,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and newline */ } - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); + /* +1 for \0 */ + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) { mutex_unlock(&trace_types_lock); return -ENOMEM; @@ -2280,7 +2280,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 2); + WARN_ON(r >= len + 1); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3 From 1f9963cbb0280e0cd554161e00f1a0eeddbf1ae1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 20 Jul 2009 10:20:53 +0800 Subject: tracing/filters: improve subsystem filter Currently a subsystem filter should be applicable to all events under the subsystem, and if it fails, all the event filters will be cleared. Those behaviors make the subsys filter much less useful: # echo 'vec == 1' > irq/softirq_entry/filter # echo 'irq == 5' > irq/filter bash: echo: write error: Invalid argument # cat irq/softirq_entry/filter none I'd expect it to set the filter for irq_handler_entry/exit, and not touch softirq_entry/exit. The basic idea is to try to see which events the filter can be applied to, and then just apply it to those events: # echo 'vec == 1' > softirq_entry/filter # echo 'irq == 5' > filter # cat irq_handler_entry/filter irq == 5 # cat softirq_entry/filter vec == 1 Changelog for v2: - do some cleanups to address Frederic's comments.
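The mechanism described above is a classic two-pass apply: a dry run first marks the events the filter cannot be applied to, and a second pass applies it to the remaining ones. A minimal user-space sketch of the pattern; the event and field model here is invented, whereas the kernel matches predicates against each event's field list:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

struct event {
        const char *name;
        const char *field;  /* the only field this demo event knows */
        bool no_reset;      /* set when the filter doesn't apply */
};

/* Dry run: does the filter's field exist for this event? */
static bool filter_applies(const struct event *ev, const char *field)
{
        return strcmp(ev->field, field) == 0;
}

int main(void)
{
        struct event events[] = {
                { "irq_handler_entry", "irq", false },
                { "softirq_entry",     "vec", false },
        };
        const char *filter_field = "irq";
        size_t n = sizeof(events) / sizeof(events[0]);

        /* pass 1: mark events the filter cannot be applied to */
        for (size_t i = 0; i < n; i++)
                events[i].no_reset = !filter_applies(&events[i], filter_field);

        /* pass 2: really apply, skipping the marked events */
        for (size_t i = 0; i < n; i++)
                printf("%s: %s\n", events[i].name,
                       events[i].no_reset ? "kept as-is" : "filter applied");
        return 0;
}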
Inspired-by: Steven Rostedt Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker LKML-Reference: <4A63D485.7030703@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 124 ++++++++++++++++++++++++------------- 3 files changed, 87 insertions(+), 44 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655b..26d3673d5143 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,6 +101,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); +struct event_filter; + struct ftrace_event_call { struct list_head list; char *name; @@ -116,7 +118,7 @@ struct ftrace_event_call { int (*define_fields)(void); struct list_head fields; int filter_active; - void *filter; + struct event_filter *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94305c7bc11c..758b0dbed552 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -750,13 +750,14 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; + bool no_reset; }; struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - void *filter; + struct event_filter *filter; int nr_events; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1c80ef702b83..27c2dbea3710 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -420,7 +420,14 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void filter_free_subsystem_preds(struct event_subsystem *system) +enum { + FILTER_DISABLE_ALL, + FILTER_INIT_NO_RESET, + FILTER_SKIP_NO_RESET, +}; + +static void filter_free_subsystem_preds(struct event_subsystem *system, + int flag) { struct ftrace_event_call *call; @@ -428,6 +435,14 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) if (!call->define_fields) continue; + if (flag == FILTER_INIT_NO_RESET) { + call->filter->no_reset = false; + continue; + } + + if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) + continue; + if (!strcmp(call->system, system->name)) { filter_disable_preds(call); remove_filter_string(call->filter); @@ -529,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + bool dry_run) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -541,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_AND) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_and); + fn = filter_pred_and; + goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_or); + fn = filter_pred_or; + goto add_pred_fn; } field = find_event_field(call, pred->field_name); @@ -567,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps, else fn = filter_pred_strloc; pred->str_len = field->size; - if (pred->op == OP_NE) - pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); } else { if (field->is_signed) ret = strict_strtoll(pred->str_val, 0, &val); @@ -580,27 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } pred->val = val; - } - fn = 
select_comparison_fn(pred->op, field->size, field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } } if (pred->op == OP_NE) pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, pred, fn); + return 0; } static int filter_add_subsystem_pred(struct filter_parse_state *ps, struct event_subsystem *system, struct filter_pred *pred, - char *filter_string) + char *filter_string, + bool dry_run) { struct ftrace_event_call *call; int err = 0; + bool fail = true; list_for_each_entry(call, &ftrace_events, list) { @@ -610,16 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (strcmp(call->system, system->name)) continue; - err = filter_add_pred(ps, call, pred); - if (err) { - filter_free_subsystem_preds(system); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - goto out; - } - replace_filter_string(call->filter, filter_string); + if (call->filter->no_reset) + continue; + + err = filter_add_pred(ps, call, pred, dry_run); + if (err) + call->filter->no_reset = true; + else + fail = false; + + if (!dry_run) + replace_filter_string(call->filter, filter_string); } -out: - return err; + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return err; + } + return 0; } static void parse_init(struct filter_parse_state *ps, @@ -978,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps) static int replace_preds(struct event_subsystem *system, struct ftrace_event_call *call, struct filter_parse_state *ps, - char *filter_string) + char *filter_string, + bool dry_run) { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; int err; + int n_preds = 0; err = check_preds(ps); if (err) @@ -1002,19 +1033,14 @@ static int replace_preds(struct event_subsystem *system, continue; } + if (n_preds++ == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) - err = filter_add_pred(ps, call, pred); - else - err = filter_add_subsystem_pred(ps, system, - pred, filter_string); - filter_free_pred(pred); - if (err) - return err; - - operand1 = operand2 = NULL; - continue; + goto add_pred; } if (!operand1 || !operand2) { @@ -1023,11 +1049,12 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); +add_pred: if (call) - err = filter_add_pred(ps, call, pred); + err = filter_add_pred(ps, call, pred, false); else err = filter_add_subsystem_pred(ps, system, pred, - filter_string); + filter_string, dry_run); filter_free_pred(pred); if (err) return err; @@ -1068,7 +1095,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string); + err = replace_preds(NULL, call, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); @@ -1092,7 +1119,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); remove_filter_string(system->filter); mutex_unlock(&event_mutex); return 0; @@ -1103,7 +1130,6 @@ int 
apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); parse_init(ps, filter_ops, filter_string); @@ -1113,9 +1139,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - err = replace_preds(system, NULL, ps, filter_string); - if (err) + filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); + + /* try to see the filter can be applied to which events */ + err = replace_preds(system, NULL, ps, filter_string, true); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + + /* really apply the filter to the events */ + err = replace_preds(system, NULL, ps, filter_string, false); + if (err) { append_filter_err(ps, system->filter); + filter_free_subsystem_preds(system, 2); + } out: filter_opstack_clear(ps); -- cgit v1.2.3 From fbd90375d7531927d312766b548376d909811b4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 13:40:14 +0200 Subject: hrtimer: Remove cb_entry from struct hrtimer It's unused, remove it. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: --- include/linux/hrtimer.h | 2 -- kernel/hrtimer.c | 1 - 2 files changed, 3 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 54648e625efd..40e7d54fc424 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -91,7 +91,6 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started * @start_comm: timer statistics field to store the name of the process which @@ -108,7 +107,6 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - struct list_head cb_entry; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 43d151f185b6..052a0f53e4eb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1092,7 +1092,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3 From d857ace143df3884954887e1899a65831ca72ece Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 22 Jul 2009 11:21:31 +0800 Subject: tracing/ksym_tracer: fix the output of ksym tracer Fix the output format of the ksym tracer, making it properly aligned. Before patch: # tracer: ksym_tracer # # TASK-PID CPU# Symbol Type Function # | | | | | bash 1378 1 ksym_tracer_mutex W mutex_lock+0x11/0x27 bash 1378 1 ksym_filter_head W process_new_ksym_entry+0xd2/0x10c bash 1378 1 ksym_tracer_mutex W mutex_unlock+0x12/0x1b cat 1429 0 ksym_tracer_mutex W mutex_lock+0x11/0x27 After patch: # tracer: ksym_tracer # # TASK-PID CPU# Symbol Type Function # | | | | | cat-1423 [000] ksym_tracer_mutex RW mutex_lock+0x11/0x27 cat-1423 [000] ksym_filter_head RW ksym_trace_filter_read+0x6e/0x10d cat-1423 [000] ksym_tracer_mutex RW mutex_unlock+0x12/0x1b cat-1423 [000] ksym_tracer_mutex RW mutex_lock+0x11/0x27 cat-1423 [000] ksym_filter_head RW ksym_trace_filter_read+0x6e/0x10d cat-1423 [000]
ksym_tracer_mutex RW mutex_unlock+0x12/0x1b Signed-off-by: Xiao Guangrong LKML-Reference: <4A6685BB.2090809@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_ksym.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 1256a6e8ee24..fbf3a8e13bc5 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -370,13 +370,12 @@ static int ksym_trace_init(struct trace_array *tr) static void ksym_trace_print_header(struct seq_file *m) { - seq_puts(m, - "# TASK-PID CPU# Symbol Type " - "Function \n"); + "# TASK-PID CPU# Symbol " + "Type Function\n"); seq_puts(m, - "# | | | | " - "| \n"); + "# | | | " + " | |\n"); } static enum print_line_t ksym_trace_output(struct trace_iterator *iter) @@ -392,7 +391,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%-15s %-5d %-3d %-20s ", field->cmd, + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %-30s ", field->cmd, entry->pid, iter->cpu, field->ksym_name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -412,7 +411,7 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter) return TRACE_TYPE_PARTIAL_LINE; sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%-20s\n", str); + ret = trace_seq_printf(s, "%s\n", str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From 8e068542a8d9efec55126284d2f5cb32f003d507 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 22 Jul 2009 11:23:41 +0800 Subject: tracing/ksym_tracer: fix write operation of ksym_trace_filter This patch fixes 2 bugs: - fix the return value of ksym_trace_filter_write() when we want to clear a symbol in the ksym_trace_filter file, for example: # echo global_trace:rw- > /debug/tracing/ksym_trace_filter # echo global_trace:--- > /debug/tracing/ksym_trace_filter -bash: echo: write error: Invalid argument # cat /debug/tracing/ksym_trace_filter # We want to clear 'global_trace' in ksym_trace_filter, it complains with "Invalid argument", but the operation is successful - the "r--" access type is not allowed, but the ksym_trace_filter file thinks it is OK, for example: # echo ksym_tracer_mutex:r-- > ksym_trace_filter -bash: echo: write error: Resource temporarily unavailable # dmesg ksym_tracer request failed. Try again later!!
The error occurs at register_kernel_hw_breakpoint(), but it should be caught by the access-type parser Signed-off-by: Xiao Guangrong LKML-Reference: <4A66863D.5090802@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_ksym.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index fbf3a8e13bc5..cd5cb656c3d2 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -135,6 +135,9 @@ static int ksym_trace_get_access_type(char *str) case 6: access = HW_BREAKPOINT_RW; break; + case 4: + access = -EINVAL; + break; case 2: access = HW_BREAKPOINT_WRITE; break; @@ -312,6 +315,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, kfree(entry->ksym_hbp->info.name); kfree(entry->ksym_hbp); kfree(entry); + ret = 0; goto out; } else { /* Check for malformed request: (4) */ -- cgit v1.2.3 From 75e33751ca8bbb72dd6f1a74d2810ddc8cbe4bdf Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 23 Jul 2009 12:01:22 +0800 Subject: tracing/ksym_tracer: support quick clear for ksym_trace_filter -- v2 It's rather boring to clear symbols one by one in the ksym_trace_filter file, so this patch lets the ksym_trace_filter file support quickly clearing all breakpoints. We can write "0" to this file and it will clear all symbols, for example: # cat ksym_trace_filter ksym_filter_head:rw- global_trace:rw- # echo 0 > ksym_trace_filter # cat ksym_trace_filter # Changelog v1->v2: Add other ways to clear all breakpoints by writing NULL or "*:---" to the ksym_trace_filter file, based on K.Prasad's suggestion Signed-off-by: Xiao Guangrong LKML-Reference: <4A67E092.3080202@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_ksym.c | 53 +++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index cd5cb656c3d2..2fde875ead4c 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -163,8 +163,6 @@ static int parse_ksym_trace_str(char *input_string, char **ksymname, { int ret; - strstrip(input_string); - *ksymname = strsep(&input_string, ":"); *addr = kallsyms_lookup_name(*ksymname); @@ -262,6 +260,25 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, return cnt; } +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_kernel_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry->ksym_hbp->info.name); + kfree(entry->ksym_hbp); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + static ssize_t ksym_trace_filter_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -282,6 +299,21 @@ static ssize_t ksym_trace_filter_write(struct file *file, } input_string[count] = '\0'; + strstrip(input_string); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { + __ksym_trace_reset(); + kfree(input_string); + return count; + } + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); if (ret < 0) { kfree(input_string); @@ -341,23 +373,8 @@ static const struct file_operations ksym_tracing_fops = { static void
ksym_trace_reset(struct trace_array *tr) { - struct trace_ksym *entry; - struct hlist_node *node, *node1; - ksym_tracing_enabled = 0; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_kernel_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry->ksym_hbp->info.name); - kfree(entry->ksym_hbp); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); + __ksym_trace_reset(); } static int ksym_trace_init(struct trace_array *tr) -- cgit v1.2.3 From a004cd42181409eda70804ded240a791f4564d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 09:54:05 +0200 Subject: sched: Fix return value of migration_init() migration_init() returns the return value of the hotplug notifier. In the success case this is NOTIFY_OK which is 1. initcall_debug evaluates that as an error code because init calls are expected to return 0 on success. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1f7919add8ae..953f037dc053 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7654,7 +7654,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif -- cgit v1.2.3 From c94aa5ca3088018d2a7a9bd3258aefffe29df265 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Print the shortest dependency chain if finding a circle Currently lockdep will print the 1st circle detected if it exists when acquiring a new (next) lock. This patch prints the shortest path from the next lock to be acquired to the previous held lock if a circle is found. The patch still uses the current method to check for a circle, and once the circle is found, breadth-first search algorithm is used to compute the shortest path from the next lock to the previous lock in the forward lock dependency graph. Printing the shortest path will shorten the dependency chain, and make troubleshooting for possible circular locking easier. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-2-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 6 +++ kernel/lockdep.c | 115 +++++++++++++++++++++++++++++++++++++++++----- kernel/lockdep_internals.h | 83 ++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 9 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b25d1b53df0d..9ec026f8d09e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -149,6 +149,12 @@ struct lock_list { struct lock_class *class; struct stack_trace trace; int distance; + + /*The parent field is used to implement breadth-first search,and + *the bit 0 is reused to indicate if the lock has been accessed + *in BFS.
+ */ + struct lock_list *parent; }; /* diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8bbeef996c76..93dc70d18cdf 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -897,6 +897,79 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } +static struct circular_queue lock_cq; +static int __search_shortest_path(struct lock_list *source_entry, + struct lock_class *target, + struct lock_list **target_entry, + int forward) +{ + struct lock_list *entry; + struct circular_queue *cq = &lock_cq; + int ret = 1; + + __cq_init(cq); + + mark_lock_accessed(source_entry, NULL); + if (source_entry->class == target) { + *target_entry = source_entry; + ret = 0; + goto exit; + } + + __cq_enqueue(cq, (unsigned long)source_entry); + + while (!__cq_empty(cq)) { + struct lock_list *lock; + struct list_head *head; + + __cq_dequeue(cq, (unsigned long *)&lock); + + if (!lock->class) { + ret = -2; + goto exit; + } + + if (forward) + head = &lock->class->locks_after; + else + head = &lock->class->locks_before; + + list_for_each_entry(entry, head, entry) { + if (!lock_accessed(entry)) { + mark_lock_accessed(entry, lock); + if (entry->class == target) { + *target_entry = entry; + ret = 0; + goto exit; + } + + if (__cq_enqueue(cq, (unsigned long)entry)) { + ret = -1; + goto exit; + } + } + } + } +exit: + return ret; +} + +static inline int __search_forward_shortest_path(struct lock_list *src_entry, + struct lock_class *target, + struct lock_list **target_entry) +{ + return __search_shortest_path(src_entry, target, target_entry, 1); + +} + +static inline int __search_backward_shortest_path(struct lock_list *src_entry, + struct lock_class *target, + struct lock_list **target_entry) +{ + return __search_shortest_path(src_entry, target, target_entry, 0); + +} + /* * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe @@ -934,7 +1007,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) { struct task_struct *curr = current; - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (debug_locks_silent) return 0; printk("\n=======================================================\n"); @@ -954,19 +1027,41 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) return 0; } -static noinline int print_circular_bug_tail(void) +static noinline int print_circular_bug(void) { struct task_struct *curr = current; struct lock_list this; + struct lock_list *target; + struct lock_list *parent; + int result; + unsigned long depth; - if (debug_locks_silent) + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; this.class = hlock_class(check_source); if (!save_trace(&this.trace)) return 0; - print_circular_bug_entry(&this, 0); + result = __search_forward_shortest_path(&this, + hlock_class(check_target), + &target); + if (result) { + printk("\n%s:search shortest path failed:%d\n", __func__, + result); + return 0; + } + + depth = get_lock_depth(target); + + print_circular_bug_header(target, depth); + + parent = get_lock_parent(target); + + while (parent) { + print_circular_bug_entry(parent, --depth); + parent = get_lock_parent(parent); + } printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); @@ -1072,14 +1167,15 @@ check_noncircular(struct lock_class *source, unsigned int depth) */ list_for_each_entry(entry, &source->locks_after, entry) { if (entry->class == hlock_class(check_target)) - return 
print_circular_bug_header(entry, depth+1); + return 2; debug_atomic_inc(&nr_cyclic_checks); - if (!check_noncircular(entry->class, depth+1)) - return print_circular_bug_entry(entry, depth+1); + if (check_noncircular(entry->class, depth+1) == 2) + return 2; } return 1; } + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1484,8 +1580,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; - if (!(check_noncircular(hlock_class(next), 0))) - return print_circular_bug_tail(); + if (check_noncircular(hlock_class(next), 0) == 2) + return print_circular_bug(); + if (!check_prev_add_irq(curr, prev, next)) return 0; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 699a2ac3a0d7..6f48d37d5be2 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -136,3 +136,86 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 #endif + +/* The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. + * */ +#define MAX_CIRCULAR_QUE_SIZE 4096UL +struct circular_queue{ + unsigned long element[MAX_CIRCULAR_QUE_SIZE]; + unsigned int front, rear; +}; + +#define LOCK_ACCESSED 1UL +#define LOCK_ACCESSED_MASK (~LOCK_ACCESSED) + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE; + return 0; +} + +static inline int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + lock->parent = (void *) parent + LOCK_ACCESSED; +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + return (unsigned long)lock->parent & LOCK_ACCESSED; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return (struct lock_list *) + ((unsigned long)child->parent & LOCK_ACCESSED_MASK); +} + +static inline unsigned long get_lock_depth(struct lock_list *child) +{ + unsigned long depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} -- cgit v1.2.3 From d588e46155e9c51217b9840db1e94a0f594c1af2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Improve implementation of BFS 1) replace %MAX_CIRCULAR_QUE_SIZE with &(MAX_CIRCULAR_QUE_SIZE-1), since we define MAX_CIRCULAR_QUE_SIZE as a power of 2; 2) use a bitmap to mark whether a lock has been accessed in BFS, so that it can be cleared quickly, because we may search a graph
many times. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-3-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 23 ++++++++++++++++------- kernel/lockdep_internals.h | 35 +++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 93dc70d18cdf..5dcca26a8263 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,7 +42,7 @@ #include #include #include - +#include #include #include "lockdep_internals.h" @@ -118,7 +118,7 @@ static inline int debug_locks_off_graph_unlock(void) static int lockdep_initialized; unsigned long nr_list_entries; -static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* * All data structures here are protected by the global debug_lock. @@ -897,30 +897,38 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } +unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; + static int __search_shortest_path(struct lock_list *source_entry, struct lock_class *target, struct lock_list **target_entry, int forward) { struct lock_list *entry; + struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; - __cq_init(cq); - - mark_lock_accessed(source_entry, NULL); if (source_entry->class == target) { *target_entry = source_entry; ret = 0; goto exit; } + if (forward) + head = &source_entry->class->locks_after; + else + head = &source_entry->class->locks_before; + + if (list_empty(head)) + goto exit; + + __cq_init(cq); __cq_enqueue(cq, (unsigned long)source_entry); while (!__cq_empty(cq)) { struct lock_list *lock; - struct list_head *head; __cq_dequeue(cq, (unsigned long *)&lock); @@ -1040,6 +1048,7 @@ static noinline int print_circular_bug(void) return 0; this.class = hlock_class(check_source); + this.parent = NULL; if (!save_trace(&this.trace)) return 0; @@ -1580,10 +1589,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; + if (check_noncircular(hlock_class(next), 0) == 2) return print_circular_bug(); - if (!check_prev_add_irq(curr, prev, next)) return 0; diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 6f48d37d5be2..c2f6594966f3 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -137,23 +137,28 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_read(ptr) 0 #endif + +extern unsigned long nr_list_entries; +extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +extern unsigned long bfs_accessed[]; + +/*For good efficiency of modular, we use power of 2*/ +#define MAX_CIRCULAR_QUE_SIZE 4096UL + /* The circular_queue and helpers is used to implement the * breadth-first search(BFS)algorithem, by which we can build * the shortest path from the next lock to be acquired to the * previous held lock if there is a circular between them. 
* */ -#define MAX_CIRCULAR_QUE_SIZE 4096UL struct circular_queue{ unsigned long element[MAX_CIRCULAR_QUE_SIZE]; unsigned int front, rear; }; -#define LOCK_ACCESSED 1UL -#define LOCK_ACCESSED_MASK (~LOCK_ACCESSED) - static inline void __cq_init(struct circular_queue *cq) { cq->front = cq->rear = 0; + bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); } static inline int __cq_empty(struct circular_queue *cq) @@ -163,7 +168,7 @@ static inline int __cq_empty(struct circular_queue *cq) static inline int __cq_full(struct circular_queue *cq) { - return ((cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE) == cq->front; + return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1)) == cq->front; } static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) @@ -172,7 +177,7 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return -1; cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1)%MAX_CIRCULAR_QUE_SIZE; + cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1); return 0; } @@ -182,30 +187,36 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) return -1; *elem = cq->element[cq->front]; - cq->front = (cq->front + 1)%MAX_CIRCULAR_QUE_SIZE; + cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1); return 0; } static inline int __cq_get_elem_count(struct circular_queue *cq) { - return (cq->rear - cq->front)%MAX_CIRCULAR_QUE_SIZE; + return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); } static inline void mark_lock_accessed(struct lock_list *lock, struct lock_list *parent) { - lock->parent = (void *) parent + LOCK_ACCESSED; + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + set_bit(nr, bfs_accessed); } static inline unsigned long lock_accessed(struct lock_list *lock) { - return (unsigned long)lock->parent & LOCK_ACCESSED; + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return test_bit(nr, bfs_accessed); } static inline struct lock_list *get_lock_parent(struct lock_list *child) { - return (struct lock_list *) - ((unsigned long)child->parent & LOCK_ACCESSED_MASK); + return child->parent; } static inline unsigned long get_lock_depth(struct lock_list *child) -- cgit v1.2.3 From 9e2d551ea0d767c0d624965f0c273e942f4be536 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Introduce match function to BFS 1) introduce match() to BFS in order to make it usable for matching different patterns; 2) also rename some functions to make them more suitable.
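The match() callback is what turns the fixed shortest-path walk into a generic graph query. A minimal user-space sketch of the idea (the graph, queue and names below are invented for illustration and are not the kernel code):

#include <stdio.h>

#define MAX_NODES 8

/* Toy adjacency matrix standing in for the lock dependency graph. */
static int edge[MAX_NODES][MAX_NODES];

/*
 * Generic BFS: walk from 'src' and return the first node for which
 * match(node, data) is true, or -1 if no such node is reachable.
 */
static int bfs(int src, int (*match)(int node, void *data), void *data)
{
	int queue[MAX_NODES], visited[MAX_NODES] = { 0 };
	int front = 0, rear = 0, n, i;

	visited[src] = 1;
	queue[rear++] = src;

	while (front < rear) {
		n = queue[front++];
		if (match(n, data))
			return n;
		for (i = 0; i < MAX_NODES; i++) {
			if (edge[n][i] && !visited[i]) {
				visited[i] = 1;
				queue[rear++] = i;
			}
		}
	}
	return -1;
}

/* One pattern: "is this a given node?" (cf. the class_equal() helper this patch adds). */
static int node_equal(int node, void *data)
{
	return node == *(int *)data;
}

int main(void)
{
	int target = 3;

	edge[0][1] = edge[1][2] = edge[2][3] = 1;
	printf("found: %d\n", bfs(0, node_equal, &target));
	return 0;
}

With the predicate factored out this way, the cycle check and the usage-bit searches in the later patches only have to supply different match() functions to the same walk.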
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-4-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5dcca26a8263..ce6d09e65ad1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -900,17 +900,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; -static int __search_shortest_path(struct lock_list *source_entry, - struct lock_class *target, - struct lock_list **target_entry, - int forward) +static int __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) { struct lock_list *entry; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; - if (source_entry->class == target) { + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; goto exit; @@ -945,7 +946,7 @@ static int __search_shortest_path(struct lock_list *source_entry, list_for_each_entry(entry, head, entry) { if (!lock_accessed(entry)) { mark_lock_accessed(entry, lock); - if (entry->class == target) { + if (match(entry, data)) { *target_entry = entry; ret = 0; goto exit; @@ -962,19 +963,21 @@ exit: return ret; } -static inline int __search_forward_shortest_path(struct lock_list *src_entry, - struct lock_class *target, - struct lock_list **target_entry) +static inline int __bfs_forward(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __search_shortest_path(src_entry, target, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, 1); } -static inline int __search_backward_shortest_path(struct lock_list *src_entry, - struct lock_class *target, - struct lock_list **target_entry) +static inline int __bfs_backward(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) { - return __search_shortest_path(src_entry, target, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, 0); } @@ -1035,6 +1038,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) return 0; } +static inline int class_equal(struct lock_list *entry, void *data) +{ + return entry->class == data; +} + static noinline int print_circular_bug(void) { struct task_struct *curr = current; @@ -1052,9 +1060,10 @@ static noinline int print_circular_bug(void) if (!save_trace(&this.trace)) return 0; - result = __search_forward_shortest_path(&this, - hlock_class(check_target), - &target); + result = __bfs_forward(&this, + hlock_class(check_target), + class_equal, + &target); if (result) { printk("\n%s:search shortest path failed:%d\n", __func__, result); -- cgit v1.2.3 From db0002a32f31060ca900b533d93a074ddf7d5b61 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement check_noncircular() by BFS This patch uses BFS to implement check_noncircular() and prints the generated shortest circle if one exists.
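In BFS terms the check is plain reachability: a new prev -> next dependency closes a circle exactly when prev can already be reached from next, and the parent pointers recorded during the walk give the shortest such circle. A rough user-space analogue (invented names and a toy graph, not the kernel implementation):

#include <stdio.h>

#define MAX_NODES 8

static int edge[MAX_NODES][MAX_NODES];
static int parent[MAX_NODES];

/* BFS from 'src'; returns 1 and fills parent[] if 'target' is reachable. */
static int reachable(int src, int target)
{
	int queue[MAX_NODES], visited[MAX_NODES] = { 0 };
	int front = 0, rear = 0, n, i;

	visited[src] = 1;
	parent[src] = -1;
	queue[rear++] = src;

	while (front < rear) {
		n = queue[front++];
		if (n == target)
			return 1;
		for (i = 0; i < MAX_NODES; i++) {
			if (edge[n][i] && !visited[i]) {
				visited[i] = 1;
				parent[i] = n;	/* shortest-path tree */
				queue[rear++] = i;
			}
		}
	}
	return 0;
}

int main(void)
{
	int prev = 2, next = 0, n;

	edge[0][1] = edge[1][2] = 1;	/* existing chain: 0 -> 1 -> 2 */

	/* Proposed dependency 2 -> 0: circular iff 2 is reachable from 0. */
	if (reachable(next, prev)) {
		printf("circle; shortest path back to %d:\n", next);
		for (n = prev; n != -1; n = parent[n])
			printf("  %d\n", n);
	}
	return 0;
}

Because BFS visits nodes in order of distance from the source, the parent chain is already a shortest path when the target is reached, which is what print_circular_bug() walks.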
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-5-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 89 +++++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ce6d09e65ad1..5609d309d568 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -985,12 +985,7 @@ static inline int __bfs_backward(struct lock_list *src_entry, * Recursive, forwards-direction lock-dependency checking, used for * both noncyclic checking and for hardirq-unsafe/softirq-unsafe * checking. - * - * (to keep the stackframe of the recursive functions small we - * use these global variables, and we also mark various helper - * functions as noinline.) */ -static struct held_lock *check_source, *check_target; /* * Print a dependency chain entry (this is only done when a deadlock @@ -1014,7 +1009,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) * header first: */ static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth) +print_circular_bug_header(struct lock_list *entry, unsigned int depth, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; @@ -1027,9 +1024,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); - print_lock(check_source); + print_lock(check_src); printk("\nbut task is already holding lock:\n"); - print_lock(check_target); + print_lock(check_tgt); printk("\nwhich lock already depends on the new lock.\n\n"); printk("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1043,36 +1040,24 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(void) +static noinline int print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; - struct lock_list this; - struct lock_list *target; struct lock_list *parent; - int result; unsigned long depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - this.class = hlock_class(check_source); - this.parent = NULL; - if (!save_trace(&this.trace)) + if (!save_trace(&this->trace)) return 0; - result = __bfs_forward(&this, - hlock_class(check_target), - &target); - if (result) { - printk("\n%s:search shortest path failed:%d\n", __func__, - result); - return 0; - } - depth = get_lock_depth(target); - print_circular_bug_header(target, depth); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -1090,6 +1075,16 @@ static noinline int print_circular_bug(void) return 0; } +static noinline int print_bfs_bug(int ret) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN(1, "lockdep bfs error:%d\n", ret); + + return 0; +} + #define RECURSION_LIMIT 40 static int noinline print_infinite_recursion_bug(void) @@ -1168,31 +1163,17 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) * lead to <target>. Print an error and return 0 if it does.
*/ static noinline int -check_noncircular(struct lock_class *source, unsigned int depth) +check_noncircular(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) { - struct lock_list *entry; + int result; - if (lockdep_dependency_visit(source, depth)) - return 1; + debug_atomic_inc(&nr_cyclic_checks); - debug_atomic_inc(&nr_cyclic_check_recursions); - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == hlock_class(check_target)) - return 2; - debug_atomic_inc(&nr_cyclic_checks); - if (check_noncircular(entry->class, depth+1) == 2) - return 2; - } - return 1; -} + result = __bfs_forward(root, target, class_equal, target_entry); + return result; +} #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* @@ -1586,6 +1567,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, { struct lock_list *entry; int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); /* * Prove that the new <prev> -> <next> dependency would not @@ -1596,11 +1579,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * We are using global variables to control the recursion, to * keep the stackframe size of the recursive functions low: */ - check_source = next; - check_target = prev; - - if (check_noncircular(hlock_class(next), 0) == 2) - return print_circular_bug(); + this.class = hlock_class(next); + this.parent = NULL; + ret = check_noncircular(&this, hlock_class(prev), &target_entry); + if (unlikely(!ret)) + return print_circular_bug(&this, target_entry, next, prev); + else if (unlikely(ret < 0)) + return print_bfs_bug(ret); if (!check_prev_add_irq(curr, prev, next)) return 0; -- cgit v1.2.3 From d7aaba140a09b7a2295aec061629c880f088de3d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement find_usage_*wards by BFS This patch uses BFS to implement find_usage_*wards(), which was originally written using DFS.
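The walk itself is unchanged; only the match criterion differs: instead of comparing lock classes, the search tests a bit in each node's usage mask. A hedged sketch of just the predicate (the struct and names are invented; the kernel's version operates on struct lock_list):

#include <stdio.h>

enum usage_bit { USED_IN_HARDIRQ, ENABLED_HARDIRQS };

struct node {
	unsigned long usage_mask;	/* one bit per recorded usage */
};

/* Match callback for the generic BFS: does this node carry 'bit'? */
static int usage_match(struct node *n, void *bit)
{
	return !!(n->usage_mask & (1UL << (long)bit));
}

int main(void)
{
	struct node a = { .usage_mask = 1UL << USED_IN_HARDIRQ };

	printf("%d\n", usage_match(&a, (void *)(long)USED_IN_HARDIRQ));
	return 0;
}

Passing the bit through the void *data argument is what lets the same __bfs() core serve both the cycle check and these usage searches.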
Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-6-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 180 ++++++++++++++++++++++--------------------------------- 1 file changed, 72 insertions(+), 108 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5609d309d568..b3ade507c6ce 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -963,7 +963,7 @@ exit: return ret; } -static inline int __bfs_forward(struct lock_list *src_entry, +static inline int __bfs_forwards(struct lock_list *src_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) @@ -972,7 +972,7 @@ static inline int __bfs_forward(struct lock_list *src_entry, } -static inline int __bfs_backward(struct lock_list *src_entry, +static inline int __bfs_backwards(struct lock_list *src_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) @@ -1085,18 +1085,6 @@ static noinline int print_bfs_bug(int ret) return 0; } -#define RECURSION_LIMIT 40 - -static int noinline print_infinite_recursion_bug(void) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - WARN_ON(1); - - return 0; -} - unsigned long __lockdep_count_forward_deps(struct lock_class *class, unsigned int depth) { @@ -1170,7 +1158,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target, debug_atomic_inc(&nr_cyclic_checks); - result = __bfs_forward(root, target, class_equal, target_entry); + result = __bfs_forwards(root, target, class_equal, target_entry); return result; } @@ -1181,101 +1169,70 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static enum lock_usage_bit find_usage_bit; static struct lock_class *forwards_match, *backwards_match; + +#define BFS_PROCESS_RET(ret) do { \ + if (ret < 0) \ + return print_bfs_bug(ret); \ + if (ret == 1) \ + return 1; \ + } while (0) + +static inline int usage_match(struct lock_list *entry, void *bit) +{ + return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + + /* * Find a node in the forwards-direction dependency sub-graph starting - * at that matches . + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into . + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. 
*/ -static noinline int -find_usage_forwards(struct lock_class *source, unsigned int depth) +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + int result; debug_atomic_inc(&nr_find_usage_forwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - forwards_match = source; - return 2; - } - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_after, entry) { - debug_atomic_inc(&nr_find_usage_forwards_recursions); - ret = find_usage_forwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; + result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + + return result; } /* * Find a node in the backwards-direction dependency sub-graph starting - * at that matches . + * at @root->class that matches @bit. * - * Return 2 if such a node exists in the subgraph, and put that node - * into . + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. * - * Return 1 otherwise and keep unchanged. - * Return 0 on error. + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. */ -static noinline int -find_usage_backwards(struct lock_class *source, unsigned int depth) +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) { - struct lock_list *entry; - int ret; - - if (lockdep_dependency_visit(source, depth)) - return 1; - - if (!__raw_spin_is_locked(&lockdep_lock)) - return DEBUG_LOCKS_WARN_ON(1); - - if (depth > max_recursion_depth) - max_recursion_depth = depth; - if (depth >= RECURSION_LIMIT) - return print_infinite_recursion_bug(); + int result; debug_atomic_inc(&nr_find_usage_backwards_checks); - if (source->usage_mask & (1 << find_usage_bit)) { - backwards_match = source; - return 2; - } - if (!source && debug_locks_off_graph_unlock()) { - WARN_ON(1); - return 0; - } + result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); - /* - * Check this lock's dependency list: - */ - list_for_each_entry(entry, &source->locks_before, entry) { - debug_atomic_inc(&nr_find_usage_backwards_recursions); - ret = find_usage_backwards(entry->class, depth+1); - if (ret == 2 || ret == 0) - return ret; - } - return 1; + return result; } + static int print_bad_irq_dependency(struct task_struct *curr, struct held_lock *prev, @@ -1343,18 +1300,21 @@ check_usage(struct task_struct *curr, struct held_lock *prev, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); + + this.parent = NULL; + + this.class = hlock_class(prev); + ret = find_usage_backwards(&this, bit_backwards, &target_entry); + BFS_PROCESS_RET(ret); + backwards_match = target_entry->class; + + this.class = hlock_class(next); + ret = find_usage_forwards(&this, bit_forwards, &target_entry); + BFS_PROCESS_RET(ret); + forwards_match = target_entry->class; - find_usage_bit = bit_backwards; - /* fills in */ - ret = find_usage_backwards(hlock_class(prev), 0); - if (!ret || ret == 1) - return ret; - - find_usage_bit = bit_forwards; - ret = find_usage_forwards(hlock_class(next), 0); - if (!ret || ret == 1) - return ret; - /* ret == 2 */ return 
print_bad_irq_dependency(curr, prev, next, bit_backwards, bit_forwards, irqclass); } @@ -2029,14 +1989,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); - find_usage_bit = bit; - /* fills in */ - ret = find_usage_forwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_forwards(&root, bit, &target_entry); + BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); + return print_irq_inversion_bug(curr, target_entry->class, + this, 1, irqclass); } /* @@ -2048,14 +2010,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit bit, const char *irqclass) { int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); - find_usage_bit = bit; - /* fills in */ - ret = find_usage_backwards(hlock_class(this), 0); - if (!ret || ret == 1) - return ret; + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_backwards(&root, bit, &target_entry); + BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); + return print_irq_inversion_bug(curr, target_entry->class, + this, 1, irqclass); } void print_irqtrace_events(struct task_struct *curr) -- cgit v1.2.3 From 24208ca76707581a097c01a73fd63781e73d3404 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Introduce print_shortest_lock_dependencies Since the shortest lock dependencies' path may be obtained by BFS, we print the shortest one by print_shortest_lock_dependencies(). Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-7-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 93 +++++++++++++++++++++++++++++++++------------- kernel/lockdep_internals.h | 4 +- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b3ade507c6ce..938dc500f1aa 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -575,6 +575,36 @@ static void print_lock_class_header(struct lock_class *class, int depth) print_ip_sym((unsigned long)class->key); } +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... 
acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad BFS generated tree\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} /* * printk all lock dependencies starting at : */ @@ -992,7 +1022,7 @@ static inline int __bfs_backwards(struct lock_list *src_entry, * has been detected): */ static noinline int -print_circular_bug_entry(struct lock_list *target, unsigned int depth) +print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) return 0; @@ -1047,7 +1077,7 @@ static noinline int print_circular_bug(struct lock_list *this, { struct task_struct *curr = current; struct lock_list *parent; - unsigned long depth; + int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; @@ -1169,7 +1199,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static struct lock_class *forwards_match, *backwards_match; #define BFS_PROCESS_RET(ret) do { \ @@ -1235,6 +1264,10 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, static int print_bad_irq_dependency(struct task_struct *curr, + struct lock_list *prev_root, + struct lock_list *next_root, + struct lock_list *backwards_entry, + struct lock_list *forwards_entry, struct held_lock *prev, struct held_lock *next, enum lock_usage_bit bit1, @@ -1267,26 +1300,32 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_match); + print_lock_name(backwards_entry->class); printk("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_match->usage_traces + bit1, 1); + print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_match); + print_lock_name(forwards_entry->class); printk("\n... 
which became %s-irq-unsafe at:\n", irqclass); printk("..."); - print_stack_trace(forwards_match->usage_traces + bit2, 1); + print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); lockdep_print_held_locks(curr); - printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); - print_lock_dependencies(backwards_match, 0); + printk("\nthe dependencies between %s-irq-safe lock", irqclass); + printk(" and the holding lock:\n"); + if (!save_trace(&prev_root->trace)) + return 0; + print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); - print_lock_dependencies(forwards_match, 0); + printk("\nthe dependencies between the lock to be acquired"); + printk(" and %s-irq-unsafe lock:\n", irqclass); + if (!save_trace(&next_root->trace)) + return 0; + print_shortest_lock_dependencies(forwards_entry, next_root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1300,22 +1339,24 @@ check_usage(struct task_struct *curr, struct held_lock *prev, enum lock_usage_bit bit_forwards, const char *irqclass) { int ret; - struct lock_list this; + struct lock_list this, that; struct lock_list *uninitialized_var(target_entry); + struct lock_list *uninitialized_var(target_entry1); this.parent = NULL; this.class = hlock_class(prev); ret = find_usage_backwards(&this, bit_backwards, &target_entry); BFS_PROCESS_RET(ret); - backwards_match = target_entry->class; - this.class = hlock_class(next); - ret = find_usage_forwards(&this, bit_forwards, &target_entry); + that.parent = NULL; + that.class = hlock_class(next); + ret = find_usage_forwards(&that, bit_forwards, &target_entry1); BFS_PROCESS_RET(ret); - forwards_match = target_entry->class; - return print_bad_irq_dependency(curr, prev, next, + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, bit_backwards, bit_forwards, irqclass); } @@ -1944,7 +1985,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * print irq inversion bug: */ static int -print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, +print_irq_inversion_bug(struct task_struct *curr, + struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, const char *irqclass) { @@ -1962,17 +2004,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other); + print_lock_name(other->class); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(hlock_class(this), 0); - - printk("\nthe second lock's dependencies:\n"); - print_lock_dependencies(other, 0); + printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + if (!save_trace(&root->trace)) + return 0; + print_shortest_lock_dependencies(other, root); printk("\nstack backtrace:\n"); dump_stack(); @@ -1997,7 +2038,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, ret = find_usage_forwards(&root, bit, &target_entry); BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, target_entry->class, + return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); } @@ 
-2018,7 +2059,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, ret = find_usage_backwards(&root, bit, &target_entry); BFS_PROCESS_RET(ret); - return print_irq_inversion_bug(curr, target_entry->class, + return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); } diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c2f6594966f3..b115aaa0bf35 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -219,9 +219,9 @@ static inline struct lock_list *get_lock_parent(struct lock_list *child) return child->parent; } -static inline unsigned long get_lock_depth(struct lock_list *child) +static inline int get_lock_depth(struct lock_list *child) { - unsigned long depth = 0; + int depth = 0; struct lock_list *parent; while ((parent = get_lock_parent(child))) { -- cgit v1.2.3 From ef681026ff85c49b7399ceec3eb62bbbcce605e8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Implement lockdep_count_*ward_deps by BFS Implement lockdep_count_{for,back}ward using BFS. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-8-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 938dc500f1aa..b896f23e00cf 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1115,61 +1115,59 @@ static noinline int print_bfs_bug(int ret) return 0; } -unsigned long __lockdep_count_forward_deps(struct lock_class *class, - unsigned int depth) +static int noop_count(struct lock_list *entry, void *data) { - struct lock_list *entry; - unsigned long ret = 1; + (*(unsigned long *)data)++; + return 0; +} - if (lockdep_dependency_visit(class, depth)) - return 0; +unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += __lockdep_count_forward_deps(entry->class, depth + 1); + __bfs_forwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } - unsigned long lockdep_count_forward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; local_irq_save(flags); __raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(class, 0); + ret = __lockdep_count_forward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; } -unsigned long __lockdep_count_backward_deps(struct lock_class *class, - unsigned int depth) +unsigned long __lockdep_count_backward_deps(struct lock_list *this) { - struct lock_list *entry; - unsigned long ret = 1; + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); - if (lockdep_dependency_visit(class, depth)) - return 0; - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += __lockdep_count_backward_deps(entry->class, depth + 1); + __bfs_backwards(this, (void *)&count, noop_count, &target_entry); - return ret; + return count; } unsigned long lockdep_count_backward_deps(struct lock_class *class) { unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; local_irq_save(flags); 
__raw_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(class, 0); + ret = __lockdep_count_backward_deps(&this); __raw_spin_unlock(&lockdep_lock); local_irq_restore(flags); -- cgit v1.2.3 From 4dd861d6467007681991d8ec079d928db2018cbb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Update memory usage introduced by BFS Also account the BFS memory usage. Signed-off-by: Ming Lei [ fix build for !PROVE_LOCKING ] Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-9-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b896f23e00cf..6358cf7e84b2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3429,7 +3429,11 @@ void __init lockdep_info(void) sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 +#ifdef CONFIG_PROVE_LOCKING + + sizeof(struct circular_queue) + sizeof(bfs_accessed) +#endif + ); printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); -- cgit v1.2.3 From 12f3dfd022d7e616757a94f0538d3d525d806a16 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: Add statistics info for max bfs queue depth Add BFS statistics to the existing lockdep stats. Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-10-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 6 +++++- kernel/lockdep_internals.h | 3 ++- kernel/lockdep_proc.c | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6358cf7e84b2..744da6265d99 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -929,7 +929,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; static struct circular_queue lock_cq; - +unsigned int max_bfs_queue_depth; static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), @@ -975,6 +975,7 @@ static int __bfs(struct lock_list *source_entry, list_for_each_entry(entry, head, entry) { if (!lock_accessed(entry)) { + unsigned int cq_depth; mark_lock_accessed(entry, lock); if (match(entry, data)) { *target_entry = entry; @@ -986,6 +987,9 @@ static int __bfs(struct lock_list *source_entry, ret = -1; goto exit; } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; } } } diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index b115aaa0bf35..6baa8807efdd 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -138,6 +138,7 @@ extern atomic_t nr_find_usage_backwards_recursions; #endif +extern unsigned int max_bfs_queue_depth; extern unsigned long nr_list_entries; extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; extern unsigned long bfs_accessed[]; @@ -191,7 +192,7 @@ static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) return 0; } -static inline int __cq_get_elem_count(struct circular_queue *cq) +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) { return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); } 
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d7135aa2d2c4..9a1bf34d2ff6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -411,6 +411,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) max_lockdep_depth); seq_printf(m, " max recursion depth: %11u\n", max_recursion_depth); + seq_printf(m, " max bfs queue depth: %11u\n", + max_bfs_queue_depth); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); -- cgit v1.2.3 From af012961450949ea297b209e091bd1a3805b8a0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: lockdep: BFS cleanup Some cleanups of the lockdep code after the BFS series: - Remove the last traces of the generation id - Fixup comment style - Move the bfs routines into lockdep.c - Cleanup the bfs routines [ tom.leiming@gmail.com: Fix crash ] Signed-off-by: Peter Zijlstra LKML-Reference: <1246201486-7308-11-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 7 +- kernel/lockdep.c | 284 +++++++++++++++++++++++++++------------------ kernel/lockdep_internals.h | 97 +--------------- 3 files changed, 175 insertions(+), 213 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 9ec026f8d09e..12aabfcb45f6 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -58,7 +58,6 @@ struct lock_class { struct lockdep_subclass_key *key; unsigned int subclass; - unsigned int dep_gen_id; /* * IRQ/softirq usage tracking bits: @@ -150,9 +149,9 @@ struct lock_list { struct stack_trace trace; int distance; - /*The parent field is used to implement breadth-first search,and - *the bit 0 is reused to indicate if the lock has been accessed - *in BFS. + /* + * The parent field is used to implement breadth-first search, and the + * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 744da6265d99..1cedb00e3e7a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -43,6 +43,7 @@ #include #include #include + #include #include "lockdep_internals.h" @@ -118,7 +119,7 @@ static inline int debug_locks_off_graph_unlock(void) static int lockdep_initialized; unsigned long nr_list_entries; -struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* * All data structures here are protected by the global debug_lock. @@ -390,19 +391,6 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; unsigned int max_recursion_depth; -static unsigned int lockdep_dependency_gen_id; - -static bool lockdep_dependency_visit(struct lock_class *source, - unsigned int depth) -{ - if (!depth) - lockdep_dependency_gen_id++; - if (source->dep_gen_id == lockdep_dependency_gen_id) - return true; - source->dep_gen_id = lockdep_dependency_gen_id; - return false; -} - #ifdef CONFIG_DEBUG_LOCKDEP /* * We cannot printk in early bootup code. 
Not even early_printk() @@ -551,88 +539,6 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk the shortest lock dependencies from @start to @end in reverse order: - */ -static void __used -print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) -{ - struct lock_list *entry = leaf; - int depth; - - /*compute depth from generated tree by BFS*/ - depth = get_lock_depth(leaf); - - do { - print_lock_class_header(entry->class, depth); - printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad BFS generated tree\n", __func__); - break; - } - - entry = get_lock_parent(entry); - depth--; - } while (entry && (depth >= 0)); - - return; -} -/* - * printk all lock dependencies starting at : - */ -static void __used -print_lock_dependencies(struct lock_class *class, int depth) -{ - struct lock_list *entry; - - if (lockdep_dependency_visit(class, depth)) - return; - - if (DEBUG_LOCKS_WARN_ON(depth >= 20)) - return; - - print_lock_class_header(class, depth); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (DEBUG_LOCKS_WARN_ON(!entry->class)) - return; - - print_lock_dependencies(entry->class, depth + 1); - - printk("%*s ... acquired at:\n",depth,""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - } -} - static void print_kernel_version(void) { printk("%s %.*s\n", init_utsname()->release, @@ -927,14 +833,106 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } -unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; -static struct circular_queue lock_cq; +/*For good efficiency of modular, we use power of 2*/ +#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) + +/* The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. 
+ * */ +struct circular_queue { + unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + unsigned int front, rear; +}; + +static struct circular_queue lock_cq; +static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; + unsigned int max_bfs_queue_depth; + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; + bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1) & CQ_MASK; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1) & CQ_MASK; + return 0; +} + +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + set_bit(nr, bfs_accessed); +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + unsigned long nr; + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return test_bit(nr, bfs_accessed); +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ + int depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} + static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int forward) + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) { struct lock_list *entry; struct list_head *head; @@ -1202,14 +1200,6 @@ check_noncircular(struct lock_list *root, struct lock_class *target, * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ - -#define BFS_PROCESS_RET(ret) do { \ - if (ret < 0) \ - return print_bfs_bug(ret); \ - if (ret == 1) \ - return 1; \ - } while (0) - static inline int usage_match(struct lock_list *entry, void *bit) { return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); @@ -1263,6 +1253,60 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, return result; } +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + printk("%*s }\n", depth, ""); + + printk("%*s ... 
key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad BFS generated tree\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} static int print_bad_irq_dependency(struct task_struct *curr, @@ -1349,12 +1393,18 @@ check_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = find_usage_backwards(&this, bit_backwards, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; that.parent = NULL; that.class = hlock_class(next); ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_bad_irq_dependency(curr, &this, &that, target_entry, target_entry1, @@ -2038,7 +2088,10 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, bit, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); @@ -2059,7 +2112,10 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, bit, &target_entry); - BFS_PROCESS_RET(ret); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; return print_irq_inversion_bug(curr, &root, target_entry, this, 1, irqclass); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 6baa8807efdd..a2ee95ad1313 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -91,6 +91,8 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +extern unsigned int max_bfs_queue_depth; + #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); extern unsigned long lockdep_count_backward_deps(struct lock_class *); @@ -136,98 +138,3 @@ extern atomic_t nr_find_usage_backwards_recursions; # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 #endif - - -extern unsigned int max_bfs_queue_depth; -extern unsigned long nr_list_entries; -extern struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; -extern unsigned long bfs_accessed[]; - -/*For good efficiency of modular, we use power of 2*/ -#define MAX_CIRCULAR_QUE_SIZE 4096UL - -/* The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. 
- * */ -struct circular_queue{ - unsigned long element[MAX_CIRCULAR_QUE_SIZE]; - unsigned int front, rear; -}; - -static inline void __cq_init(struct circular_queue *cq) -{ - cq->front = cq->rear = 0; - bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); -} - -static inline int __cq_empty(struct circular_queue *cq) -{ - return (cq->front == cq->rear); -} - -static inline int __cq_full(struct circular_queue *cq) -{ - return ((cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1)) == cq->front; -} - -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) -{ - if (__cq_full(cq)) - return -1; - - cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1)&(MAX_CIRCULAR_QUE_SIZE-1); - return 0; -} - -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) -{ - if (__cq_empty(cq)) - return -1; - - *elem = cq->element[cq->front]; - cq->front = (cq->front + 1)&(MAX_CIRCULAR_QUE_SIZE-1); - return 0; -} - -static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) -{ - return (cq->rear - cq->front)&(MAX_CIRCULAR_QUE_SIZE-1); -} - -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) -{ - unsigned long nr; - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); - lock->parent = parent; - set_bit(nr, bfs_accessed); -} - -static inline unsigned long lock_accessed(struct lock_list *lock) -{ - unsigned long nr; - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); - return test_bit(nr, bfs_accessed); -} - -static inline struct lock_list *get_lock_parent(struct lock_list *child) -{ - return child->parent; -} - -static inline int get_lock_depth(struct lock_list *child) -{ - int depth = 0; - struct lock_list *parent; - - while ((parent = get_lock_parent(child))) { - child = parent; - depth++; - } - return depth; -} -- cgit v1.2.3 From 3885123da8335dc6b67387e5e626acbffc56f664 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:50 +0900 Subject: swiotlb: remove unused swiotlb_alloc_boot() Nobody uses swiotlb_alloc_boot(). Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 7 +------ 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee44200..0ac7cd524788 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,11 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - void *swiotlb_alloc(unsigned order, unsigned long nslabs) { return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index cb1a6631b8f4..94db70444c17 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -14,7 +14,6 @@ struct scatterlist; */ #define IO_TLB_SEGSIZE 128 - /* * log of the size of each IO TLB slab. The number of slabs is command line * controllable. 
@@ -24,7 +23,6 @@ struct scatterlist; extern void swiotlb_init(void); -extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bffe6d7ef9d9..9edfdd442edc 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -114,11 +114,6 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -void * __weak __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) { return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); @@ -189,7 +184,7 @@ swiotlb_init_with_default_size(size_t default_size) /* * Get IO TLB memory from the low pages */ - io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); + io_tlb_start = alloc_bootmem_low_pages(bytes); if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer"); io_tlb_end = io_tlb_start + bytes; -- cgit v1.2.3 From bb52196be37ce154ddc50b1f39496146d181cbe7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:51 +0900 Subject: swiotlb: remove unused swiotlb_alloc() Nobody uses swiotlb_alloc(). Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 8 ++------ 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 0ac7cd524788..ea675cfe76fe 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,11 +13,6 @@ int swiotlb __read_mostly; -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) { return paddr; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 94db70444c17..6bc50944040f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,8 +23,6 @@ struct scatterlist; extern void swiotlb_init(void); -extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); - extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 9edfdd442edc..3c4c21cdf43d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -114,11 +114,6 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) { return paddr; @@ -240,7 +235,8 @@ swiotlb_late_init_with_default_size(size_t default_size) bytes = io_tlb_nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs); + io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); if (io_tlb_start) break; order--; -- cgit v1.2.3 From cf56e3f2e8a8d5b7bc719980869b0e7985c256f3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:52 +0900 Subject: swiotlb: remove swiotlb_arch_range_needs_mapping Nobody uses swiotlb_arch_range_needs_mapping(). 
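Hooks like this are plain weak symbols: the library supplies a default definition, and an architecture can override it simply by linking in a strong one. A stand-alone illustration of the mechanism (GCC attribute syntax, which the kernel wraps as __weak; the function name is invented):

#include <stdio.h>

/*
 * Library-side default. An object file that defines a non-weak
 * range_needs_mapping() would silently win at link time.
 */
int __attribute__((weak)) range_needs_mapping(unsigned long paddr,
					      unsigned long size)
{
	return 0;
}

int main(void)
{
	/* No strong override linked in, so the weak default answers. */
	printf("needs mapping: %d\n", range_needs_mapping(0x1000UL, 4096UL));
	return 0;
}

Since no architecture ever provided an override, the indirection bought nothing, and deleting it reduces the checks in the mapping paths to a plain swiotlb_force test, as the diff below shows.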
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 15 ++------------- 3 files changed, 2 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index ea675cfe76fe..165bd7f93bb1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -23,11 +23,6 @@ phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) return baddr; } -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 6bc50944040f..a977da24f17c 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -28,8 +28,6 @@ extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t address); -extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); - extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 3c4c21cdf43d..dc1cd1f5369e 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -141,11 +141,6 @@ int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void swiotlb_print_info(unsigned long bytes) { phys_addr_t pstart, pend; @@ -312,11 +307,6 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } -static inline int range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return swiotlb_force || swiotlb_arch_range_needs_mapping(paddr, size); -} - static int is_swiotlb_buffer(char *addr) { return addr >= io_tlb_start && addr < io_tlb_end; } /* @@ -646,8 +636,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(phys, size)) + if (!address_needs_mapping(dev, dev_addr, size) && !swiotlb_force) return dev_addr; /* @@ -810,7 +799,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, phys_addr_t paddr = sg_phys(sg); dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); - if (range_needs_mapping(paddr, sg->length) || + if (swiotlb_force || address_needs_mapping(hwdev, dev_addr, sg->length)) { void *map = map_single(hwdev, sg_phys(sg), sg->length, dir); -- cgit v1.2.3 From 02ca646e73f3cb9be69e339841b94edae675e248 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 23 Jul 2009 11:18:49 +0900 Subject: swiotlb: remove unnecessary swiotlb_bus_to_virt swiotlb_bus_to_virt is unnecessary; we can use swiotlb_bus_to_phys and phys_to_virt instead.
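The equivalence this relies on, as a minimal sketch (valid only for lowmem buffers, where phys_to_virt() is defined; bus_to_virt_equiv is a hypothetical name for illustration):

	static void *bus_to_virt_equiv(struct device *hwdev, dma_addr_t addr)
	{
		/* what swiotlb_bus_to_virt(hwdev, addr) used to compute */
		return phys_to_virt(swiotlb_bus_to_phys(hwdev, addr));
	}

The conversion below instead keeps the physical address around and calls phys_to_virt() only where a virtual address is actually needed.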
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/powerpc/kernel/dma-swiotlb.c | 10 ---------- lib/swiotlb.c | 41 ++++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 68ccf11e4f19..41534ae2f589 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,16 +24,6 @@ int swiotlb __read_mostly; unsigned int ppc_swiotlb_enable; -void *swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t addr) -{ - unsigned long pfn = PFN_DOWN(swiotlb_bus_to_phys(hwdev, addr)); - void *pageaddr = page_address(pfn_to_page(pfn)); - - if (pageaddr != NULL) - return pageaddr + (addr % PAGE_SIZE); - return NULL; -} - dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) { return paddr + get_dma_direct_offset(hwdev); } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index dc1cd1f5369e..cba3db809fbc 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -124,17 +124,13 @@ phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) return baddr; } +/* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) { return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) -{ - return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); -} - int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { @@ -307,9 +303,10 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } -static int is_swiotlb_buffer(char *addr) +static int is_swiotlb_buffer(phys_addr_t paddr) { - return addr >= io_tlb_start && addr < io_tlb_end; + return paddr >= virt_to_phys(io_tlb_start) && + paddr < virt_to_phys(io_tlb_end); } /* @@ -582,11 +579,13 @@ EXPORT_SYMBOL(swiotlb_alloc_coherent); void swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dma_handle) + dma_addr_t dev_addr) { + phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); + WARN_ON(irqs_disabled()); - if (!is_swiotlb_buffer(vaddr)) - free_pages((unsigned long) vaddr, get_order(size)); + if (!is_swiotlb_buffer(paddr)) + free_pages((unsigned long)vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); @@ -671,19 +670,25 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) { - do_unmap_single(hwdev, dma_addr, size, dir); + if (is_swiotlb_buffer(paddr)) { + do_unmap_single(hwdev, phys_to_virt(paddr), size, dir); return; } if (dir != DMA_FROM_DEVICE) return; - dma_mark_clean(dma_addr, size); + /* + * phys_to_virt doesn't work with highmem page but we could + * call dma_mark_clean() with highmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary.
+ */ + dma_mark_clean(phys_to_virt(paddr), size); } void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, @@ -708,19 +713,19 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) { - sync_single(hwdev, dma_addr, size, dir, target); + if (is_swiotlb_buffer(paddr)) { + sync_single(hwdev, phys_to_virt(paddr), size, dir, target); return; } if (dir != DMA_FROM_DEVICE) return; - dma_mark_clean(dma_addr, size); + dma_mark_clean(phys_to_virt(paddr), size); } void -- cgit v1.2.3 From 99becaca86d184a4433e9fde879ff97303d7669f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:54 +0900 Subject: x86: add dma_capable() to replace is_buffer_dma_capable() dma_capable() eventually replaces is_buffer_dma_capable(), which tells if a memory area is dma-capable or not. The problem of is_buffer_dma_capable() is that it doesn't take a pointer to struct device so it doesn't work for POWERPC. Signed-off-by: FUJITA Tomonori --- arch/x86/include/asm/dma-mapping.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c3f9435f1c9..adac59c8f69b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -55,6 +55,14 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) -- cgit v1.2.3 From a4c2baa6e148adfb27beaf16b6fb6d465b5b3acb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:55 +0900 Subject: x86: replace is_buffer_dma_capable() with dma_capable Signed-off-by: FUJITA Tomonori --- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/kernel/pci-gart_64.c | 5 ++--- arch/x86/kernel/pci-nommu.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506b..3c945c0b3501 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -147,7 +147,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e7..98a827ee9ed7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. 
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f30..c0a4222bf62b 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", -- cgit v1.2.3 From a0b00ca84b3ecb9eebd62ad34880d8cc0d988c8a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:56 +0900 Subject: ia64: add dma_capable() to replace is_buffer_dma_capable() dma_capable() eventually replaces is_buffer_dma_capable(), which tells if a memory area is dma-capable or not. The problem of is_buffer_dma_capable() is that it doesn't take a pointer to struct device so it doesn't work for POWERPC. Signed-off-by: FUJITA Tomonori --- arch/ia64/include/asm/dma-mapping.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 5a61b5c2e18f..88d0f860394d 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -69,6 +69,14 @@ dma_set_mask (struct device *dev, u64 mask) return 0; } +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + extern int dma_get_cache_alignment(void); static inline void -- cgit v1.2.3 From 9a937c91eea31c4b594ea49a2a23c57003e04987 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:57 +0900 Subject: powerpc: add dma_capable() to replace is_buffer_dma_capable() dma_capable() eventually replaces is_buffer_dma_capable(), which tells if a memory area is dma-capable or not. The problem of is_buffer_dma_capable() is that it doesn't take a pointer to struct device so it doesn't work for POWERPC. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/powerpc/include/asm/dma-mapping.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index b44aaabdd1a6..6ff1f8581d79 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -424,6 +424,19 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) #endif } +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + struct dma_mapping_ops *ops = get_dma_ops(dev); + + if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size)) + return 0; + + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #ifdef CONFIG_NOT_COHERENT_CACHE -- cgit v1.2.3 From b9394647ac88faad9db0f9e92eac4db434faded6 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:58 +0900 Subject: swiotlb: use dma_capable() This converts swiotlb to use dma_capable() instead of swiotlb_arch_address_needs_mapping() and is_buffer_dma_capable(). 
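As a hedged usage sketch of the new predicate (check_needs_bounce() is a hypothetical helper, not part of the patch; the real conversion is in the diff below):

	static bool check_needs_bounce(struct device *dev, phys_addr_t phys,
				       size_t size)
	{
		dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);

		/* bounce if forced, or if the device cannot reach the buffer */
		return swiotlb_force || !dma_capable(dev, dev_addr, size);
	}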
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- lib/swiotlb.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index cba3db809fbc..a012d93792b7 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -131,12 +131,6 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, - dma_addr_t addr, size_t size) -{ - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); -} - static void swiotlb_print_info(unsigned long bytes) { phys_addr_t pstart, pend; @@ -297,12 +291,6 @@ cleanup1: return -ENOMEM; } -static inline int -address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) -{ - return swiotlb_arch_address_needs_mapping(hwdev, addr, size); -} - static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= virt_to_phys(io_tlb_start) && @@ -539,9 +527,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_mask = hwdev->coherent_dma_mask; ret = (void *)__get_free_pages(flags, order); - if (ret && - !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(hwdev, ret), - size)) { + if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) { /* * The allocated memory isn't reachable by the device. */ @@ -563,7 +549,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dev_addr = swiotlb_virt_to_bus(hwdev, ret); /* Confirm address can be DMA'd by device */ - if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) { + if (dev_addr + size > dma_mask) { printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", (unsigned long long)dma_mask, (unsigned long long)dev_addr); @@ -635,7 +621,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(dev, dev_addr, size) && !swiotlb_force) + if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; /* @@ -652,7 +638,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, /* * Ensure that the address returned is DMA'ble */ - if (address_needs_mapping(dev, dev_addr, size)) + if (!dma_capable(dev, dev_addr, size)) panic("map_single: bounce buffer is not DMA'ble"); return dev_addr; @@ -805,7 +791,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); if (swiotlb_force || - address_needs_mapping(hwdev, dev_addr, sg->length)) { + !dma_capable(hwdev, dev_addr, sg->length)) { void *map = map_single(hwdev, sg_phys(sg), sg->length, dir); if (!map) { -- cgit v1.2.3 From 30945fdf6a08f52370dab3bc162e96c4b4e36082 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:59 +0900 Subject: powerpc: remove unnecessary swiotlb_arch_address_needs_mapping swiotlb doesn't use swiotlb_arch_address_needs_mapping(); it uses dma_capable(). We can remove the unnecessary swiotlb_arch_address_needs_mapping(). We can remove swiotlb_addr_needs_map() and is_buffer_dma_capable() in swiotlb_pci_addr_needs_map() too; dma_capable() handles the features that both provide.
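How the two checks compose on powerpc, as a rough sketch (simplified pseudo-logic, not a literal function; addr_needs_map is the dma_ops hook consulted by the powerpc dma_capable() added earlier in this series, and the NULL check on the hook is elided here):

	/*
	 * dma_capable(dev, addr, size) ==
	 *	!ops->addr_needs_map(dev, addr, size) &&   // e.g. PCI window
	 *	dev->dma_mask != NULL &&
	 *	addr + size <= *dev->dma_mask;
	 */

so keeping a separate swiotlb_addr_needs_map()/is_buffer_dma_capable() pair adds nothing.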
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/powerpc/kernel/dma-swiotlb.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 41534ae2f589..a3bbe02cda98 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -35,29 +35,12 @@ phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) return baddr - get_dma_direct_offset(hwdev); } -/* - * Determine if an address needs bounce buffering via swiotlb. - * Going forward I expect the swiotlb code to generalize on using - * a dma_ops->addr_needs_map, and this function will move from here to the - * generic swiotlb code. - */ -int -swiotlb_arch_address_needs_mapping(struct device *hwdev, dma_addr_t addr, - size_t size) -{ - struct dma_mapping_ops *dma_ops = get_dma_ops(hwdev); - - BUG_ON(!dma_ops); - return dma_ops->addr_needs_map(hwdev, addr, size); -} - /* * Determine if an address is reachable by a pci device, or if we must bounce. */ static int swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) { - u64 mask = dma_get_mask(hwdev); dma_addr_t max; struct pci_controller *hose; struct pci_dev *pdev = to_pci_dev(hwdev); @@ -69,16 +52,9 @@ swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) if ((addr + size > max) | (addr < hose->dma_window_base_cur)) return 1; - return !is_buffer_dma_capable(mask, addr, size); -} - -static int -swiotlb_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) -{ - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); + return 0; } - /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. Since @@ -94,7 +70,6 @@ struct dma_mapping_ops swiotlb_dma_ops = { .dma_supported = swiotlb_dma_supported, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, - .addr_needs_map = swiotlb_addr_needs_map, .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, -- cgit v1.2.3 From 8f2502fd8157632909ff335a3c628e7caeec5e03 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:00 +0900 Subject: remove is_buffer_dma_capable() is_buffer_dma_capable() was replaced with dma_capable(). is_buffer_dma_capable() tells if a buffer is dma-capable or not. However, it doesn't take a pointer to struct device so it doesn't work for POWERPC. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- include/linux/dma-mapping.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 07dfd460d286..c0f6c3cd788c 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -98,11 +98,6 @@ static inline int is_device_dma_capable(struct device *dev) return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } -static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size) -{ - return addr + size <= mask; -} - #ifdef CONFIG_HAS_DMA #include #else -- cgit v1.2.3 From 8d4f5339d1ee4027c07e6b2a1cfa9dc41b0d383b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:01 +0900 Subject: x86, IA64, powerpc: add phys_to_dma() and dma_to_phys() This adds two functions, phys_to_dma() and dma_to_phys() to x86, IA64 and powerpc. swiotlb uses them. 
phys_to_dma() converts a physical address to a dma address. dma_to_phys() does the opposite. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/ia64/include/asm/dma-mapping.h | 10 ++++++++++ arch/powerpc/include/asm/dma-mapping.h | 10 ++++++++++ arch/x86/include/asm/dma-mapping.h | 10 ++++++++++ 3 files changed, 30 insertions(+) diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 88d0f860394d..f91829de329f 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -77,6 +77,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + extern int dma_get_cache_alignment(void); static inline void diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 6ff1f8581d79..0c34371ec49c 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -437,6 +437,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr + get_dma_direct_offset(dev); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr - get_dma_direct_offset(dev); +} + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #ifdef CONFIG_NOT_COHERENT_CACHE diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index adac59c8f69b..0ee770d23d0e 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -63,6 +63,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) -- cgit v1.2.3 From 862d196b27bd30a5ab8109d5cdb37980d81b1fe9 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:02 +0900 Subject: swiotlb: use phys_to_dma and dma_to_phys This converts swiotlb to use phys_to_dma and dma_to_phys instead of swiotlb_phys_to_bus() and swiotlb_bus_to_phys(). swiotlb_phys_to_bus() and swiotlb_bus_to_phys() are not necessary so this patch also removes them. 
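The two mappings are inverses of each other. On a flat bus both are the identity; on a bus with a constant offset they look like the following sketch (DMA_DIRECT_OFFSET is a made-up constant standing in for get_dma_direct_offset() on powerpc, and the example_ names are hypothetical):

	#define DMA_DIRECT_OFFSET 0x80000000ULL	/* hypothetical bus offset */

	static inline dma_addr_t example_phys_to_dma(struct device *dev,
						     phys_addr_t paddr)
	{
		return paddr + DMA_DIRECT_OFFSET;
	}

	static inline phys_addr_t example_dma_to_phys(struct device *dev,
						      dma_addr_t daddr)
	{
		return daddr - DMA_DIRECT_OFFSET;
	}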
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- include/linux/swiotlb.h | 5 ----- lib/swiotlb.c | 22 ++++++---------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a977da24f17c..73b1f1cec423 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,11 +23,6 @@ struct scatterlist; extern void swiotlb_init(void); -extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, - phys_addr_t address); -extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, - dma_addr_t address); - extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index a012d93792b7..9e2fe3e1804c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -114,21 +114,11 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) { - return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); + return phys_to_dma(hwdev, virt_to_phys(address)); } static void swiotlb_print_info(unsigned long bytes) @@ -567,7 +557,7 @@ void swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dev_addr) { - phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); WARN_ON(irqs_disabled()); if (!is_swiotlb_buffer(paddr)) @@ -612,7 +602,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); + dma_addr_t dev_addr = phys_to_dma(dev, phys); void *map; BUG_ON(dir == DMA_NONE); @@ -656,7 +646,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); @@ -699,7 +689,7 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - phys_addr_t paddr = swiotlb_bus_to_phys(hwdev, dev_addr); + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); @@ -788,7 +778,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, for_each_sg(sgl, sg, nelems, i) { phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length)) { -- cgit v1.2.3 From 8ab7ff42c9dd9231706a4ba7120eed22ff52a93b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:03 +0900 Subject: powerpc: remove unused swiotlb_phys_to_bus() and swiotlb_bus_to_phys() phys_to_dma() and dma_to_phys() are used instead of swiotlb_phys_to_bus() and swiotlb_bus_to_phys(). 
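A sanity property the swiotlb conversion assumes, sketched as a hypothetical self-check (not part of the patch):

	static void check_dma_round_trip(struct device *dev, phys_addr_t paddr)
	{
		/* translating to bus space and back must be lossless */
		BUG_ON(dma_to_phys(dev, phys_to_dma(dev, paddr)) != paddr);
	}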
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/powerpc/kernel/dma-swiotlb.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index a3bbe02cda98..e8a57de85bcf 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,17 +24,6 @@ int swiotlb __read_mostly; unsigned int ppc_swiotlb_enable; -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr + get_dma_direct_offset(hwdev); -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) - -{ - return baddr - get_dma_direct_offset(hwdev); -} - /* * Determine if an address is reachable by a pci device, or if we must bounce. */ -- cgit v1.2.3 From b683d42693c4e92b838117f5c6f7b90bfa1525c9 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:04 +0900 Subject: x86: remove unused swiotlb_phys_to_bus() and swiotlb_bus_to_phys() phys_to_dma() and dma_to_phys() are used instead of swiotlb_phys_to_bus() and swiotlb_bus_to_phys(). Signed-off-by: FUJITA Tomonori --- arch/x86/kernel/pci-swiotlb.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 165bd7f93bb1..e8a35016115f 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,16 +13,6 @@ int swiotlb __read_mostly; -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { -- cgit v1.2.3 From 94699b04eddd4b247d871930431d6fa1a46c175e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get a bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump the content of the STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default ("mce=bootlog" boot parameter can be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 07139a0578e3..7bd19c7f5315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1269,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems.
*/ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; -- cgit v1.2.3 From e3346fc48204d780f92527d06df8bf6f28d603ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:55:09 +0200 Subject: x86, mce: fix "mce" boot option handling for CONFIG_X86_NEW_MCE "mce argument mce ignored. Please use /sys" message shouldn't be printed when using "mce" boot option. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7bd19c7f5315..75919440a188 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1549,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) -- cgit v1.2.3 From 419d6162c0c0103fa2f44f6691dff9cac14c650d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:00 +0200 Subject: x86, mce: add missing __cpuinit tags mce_cap_init() and mce_cpu_quirks() can be tagged with __cpuinit. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 75919440a188..1ce6db1f8789 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1158,7 +1158,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1222,7 +1222,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { -- cgit v1.2.3 From d0c87d1f61704ed589fc0788bedd753632340e98 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:37 +0200 Subject: x86, mce: remove never executed code fseverities_coverage is never NULL in err_out code path. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f97056..51f7c725dab5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; -- cgit v1.2.3 From f3a0867b12e0cf1512c0bd0665f2339fc75ed2a8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:04:59 +0200 Subject: x86, mce: fix reporting of Thermal Monitoring mechanism enabled Early Pentium M models use different method for enabling TM2 (per paragraph 13.5.2.3 of the "Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide, Part 1"). 
Tested on the affected Pentium M variant (model == 13). Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/therm_throt.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d1ce094586a..cbec06deb68b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..15f2bc07bb60 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use a different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); -- cgit v1.2.3 From c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 11:28:21 +0200 Subject: debug lockups: Improve lockup detection When debugging a recent lockup bug I found various deficiencies in how our current lockup detection helpers work: - SysRq-L is not very efficient as it uses a workqueue, hence it cannot punch through hard lockups and cannot see through most soft lockups either. - The SysRq-L code depends on the NMI watchdog - which is off by default. - We don't print backtraces from the RCU code's built-in 'RCU state machine is stuck' debug code. This debug code tends to be one of the first (and only) mechanisms that show that a lockup has occurred. This patch changes the code so that we: - Trigger the NMI backtrace code from SysRq-L instead of using a workqueue (which cannot punch through hard lockups) - Trigger print-all-CPU-backtraces from the RCU lockup detection code Also decouple the backtrace printing code from the NMI watchdog: - Don't use variable-size cpumasks (they might not be initialized and are a bit more fragile anyway) - Trigger an NMI immediately via an IPI, instead of waiting for the NMI tick to occur. This is a lot faster and can produce more relevant backtraces. It will also work if the NMI watchdog is disabled. - Don't print the 'dazed and confused' message when we print a backtrace from the NMI - Do a show_regs() plus a dump_stack() to get maximum info out of the dump. Worst-case we get two stacktraces - which is not a big deal. Sometimes, if register content is corrupted, the precise stack walker in show_regs() won't give us a full backtrace - in this case dump_stack() will do it. Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 18 ++++++++++++------ drivers/char/sysrq.c | 8 ++------ kernel/rcutree.c | 7 ++++++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63a..1bb1ac20e9ec 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -556,10 +558,14 @@ void __trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d7a02f63e1c..165f307f30e8 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -222,12 +223,7 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - struct pt_regs *regs = get_irq_regs(); - if (regs) { - printk(KERN_INFO "CPU%d:\n", smp_processor_id()); - show_regs(regs); - } - schedule_work(&sysrq_showallcpus); + trigger_all_cpu_backtrace(); } static struct sysrq_key_op sysrq_showallcpus_op = { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c2027..9c5fa9fc57ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); + trigger_all_cpu_backtrace(); + force_quiescent_state(rsp, 0); /* Kick them all. 
*/ } @@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", smp_processor_id(), jiffies - rsp->gp_start); - dump_stack(); + trigger_all_cpu_backtrace(); + spin_lock_irqsave(&rnp->lock, flags); if ((long)(jiffies - rsp->jiffies_stall) >= 0) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ } -- cgit v1.2.3 From 716a42348cdaf04534b15fbdc9c83e25baebfed5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 24 Jul 2009 20:05:23 +0200 Subject: sched: Fix cond_resched_lock() in !CONFIG_PREEMPT The might_sleep() test inside cond_resched_lock() assumes the spinlock is held and then preemption is disabled. This is true with CONFIG_PREEMPT but the preempt_count() doesn't change otherwise. Check by starting from the appropriate preempt offset depending on the config. Reported-by: Li Zefan Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1248458723-12146-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cbbfca69aa4a..c472414953bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2293,8 +2293,14 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); +#ifdef CONFIG_PREEMPT +#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET +#else +#define PREEMPT_LOCK_OFFSET 0 +#endif + #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ __cond_resched_lock(lock); \ }) -- cgit v1.2.3 From a5004278f0525dcb9aa43703ef77bf371ea837cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jul 2009 14:04:49 +0200 Subject: sched: Fix cgroup smp fairness Commit ec4e0e2fe018992d980910db901637c814575914 ("fix inconsistency when redistribute per-cpu tg->cfs_rq shares") broke cgroup smp fairness. In order to avoid starvation of newly placed tasks, we never quite set the share of an empty cpu group-task to 0, but instead we set it as if there's a single NICE-0 task present. If however we actually set this in cfs_rq[cpu]->shares, that means the total shares for that group will be slightly inflated every time we balance, causing the observed unfairness. Fix this by setting cfs_rq[cpu]->shares to 0 but actually setting the effective weight of the related se to the inflated number. 
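A simplified numeric illustration of the inflation (hypothetical weights, using the shares formula from the diff below):

	/*
	 * tg->shares = 1024, CPU0 rq_weight = 3072, CPU1 rq_weight = 1024:
	 *
	 *   shares(cpu) = tg->shares * rq_weight(cpu) / \Sum rq_weight
	 *   shares(0)   = 1024 * 3072 / 4096 = 768
	 *   shares(1)   = 1024 * 1024 / 4096 = 256
	 *
	 * If CPU1 were idle and its zero weight were silently replaced by
	 * NICE_0_LOAD *and* written back into cfs_rq[1]->shares, the group's
	 * effective total would exceed 1024 on every rebalance - the
	 * unfairness described above.
	 */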
Signed-off-by: Peter Zijlstra LKML-Reference: <1248696557.6987.1615.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ce1056e9b02a..26976cd8be0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1523,13 +1523,18 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - unsigned long shares; unsigned long rq_weight; + unsigned long shares; + int boost = 0; if (!tg->se[cpu]) return; rq_weight = tg->cfs_rq[cpu]->rq_weight; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* * \Sum shares * rq_weight @@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; + unsigned long weight, rq_weight = 0, eff_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; @@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data) * run here it will not get delayed by group starvation. */ weight = tg->cfs_rq[i]->load.weight; + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; + eff_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + for_each_cpu(i, sched_domain_span(sd)) { + unsigned long sd_rq_weight = rq_weight; + + if (!tg->cfs_rq[i]->rq_weight) + sd_rq_weight = eff_weight; + + update_group_shares_cpu(tg, i, shares, sd_rq_weight); + } return 0; } -- cgit v1.2.3 From e709715915d69b6a929d77e7652c9c3fea61c317 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 15:41:20 +0200 Subject: sched: Optimize unused cgroup configuration When cgroup group scheduling is built in, skip some code paths if we don't have any (but the root) cgroups configured. 
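The guard pattern used below, as a minimal sketch (root_task_group_empty() is assumed to return true when only the root task group exists):

	static void some_shares_update(struct sched_domain *sd)
	{
		if (root_task_group_empty())
			return;	/* single group: nothing to redistribute */

		/* ... expensive tg-tree walk only when real groups exist ... */
	}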
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 26976cd8be0f..ca1f76ba7773 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1629,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1640,6 +1646,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1647,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.3 From da19ab510343c6496fe8b8f890091296032025c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:22 -0400 Subject: sched: Check for pushing rt tasks after all scheduling The current method for pushing RT tasks after scheduling only happens after a context switch. But we found cases where a task is set up on a run queue to be pushed but the push never happens because schedule() chooses the same task. This bug was found with the help of Gregory Haskins and the use of ftrace (trace_printk). It took several days of both of us analyzing the code and the trace output to find this.
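The failure mode, sketched as simplified control flow (illustrative comments only; the actual code follows in the diff):

	/*
	 * schedule():
	 *   prev = rq->curr;
	 *   next = pick_next_task(rq);  // may well pick prev again
	 *   if (prev != next)
	 *       context_switch(...);    // old code: the only path that
	 *                               // ran the RT post-schedule push
	 *   else
	 *       unlock rq;              // pushable task queued, never pushed
	 */

The patch therefore checks for a required push on both paths out of schedule().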
*/ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static int finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP int post_schedule = 0; +#ifdef CONFIG_SMP if (current->sched_class->needs_post_schedule) post_schedule = current->sched_class->needs_post_schedule(rq); #endif @@ -2868,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2884,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + return post_schedule; } /** @@ -2894,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); + int post_schedule; + + post_schedule = finish_task_switch(rq, prev); + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif - finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2908,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2955,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - finish_task_switch(this_rq(), prev); + return finish_task_switch(this_rq(), prev); } /* @@ -5366,6 +5371,7 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; + int post_schedule = 0; struct rq *rq; int cpu; @@ -5416,15 +5422,25 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { +#ifdef CONFIG_SMP + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif spin_unlock_irq(&rq->lock); + } + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -- cgit v1.2.3 From c3a2ae3d93c0f10d29c071f599764d00b8de00cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:23 -0400 Subject: sched: Add new prio to cpupri before removing old prio We need to add the new prio to the cpupri accounting before removing the old prio. This is because removing the old prio first will open a race window where the cpu will be removed from pri_active. In this case the cpu will not be visible for RT push and pulls. This could cause a RT task to not migrate appropriately, and create a very large latency. This bug was found with the use of ftrace sched events and trace_printk. 
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.438281019@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efbf947a..0f052fc674d5 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /* * If the cpu was currently mapped to a different value, we - * first need to unmap the old value + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being cleared from pri_active, and this cpu could be + * missed for a push or pull. */ - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); - cpumask_clear_cpu(cpu, vec->mask); - - spin_unlock_irqrestore(&vec->lock, flags); - } - if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; @@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_unlock_irqrestore(&vec->lock, flags); } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpumask_clear_cpu(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } *currpri = newpri; } -- cgit v1.2.3 From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 29 Jul 2009 11:08:47 -0400 Subject: sched: Enhance the pre/post scheduling logic We currently have an explicit "needs_post" vtable method which returns a stack variable for whether we should later run post-schedule. This leads to an awkward exchange of the variable as it bubbles back up out of the context switch. Peter Zijlstra observed that this information could be stored in the run-queue itself instead of handled on the stack. Therefore, we revert to the method of having context_switch return void, and update an internal rq->post_schedule variable when we require further processing. In addition, we fix a race condition where we try to access current->sched_class without holding the rq->lock. This is technically racy, as the sched-class could change out from under us. Instead, we reference the per-rq post_schedule variable with the runqueue unlocked, but with preemption disabled to see if we need to reacquire the rq->lock. Finally, we clean the code up slightly by removing the #ifdef CONFIG_SMP conditionals from the schedule() call, and implement some inline helper functions instead. This patch passes checkpatch, and rt-migrate. 
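The resulting calling convention, as a rough sketch (simplified from the diff below; locking noted in comments):

	pre_schedule(rq, prev);		/* rq->lock held */
	context_switch(rq, prev, next);	/* returns void again; may set
					 * rq->post_schedule under rq->lock */
	post_schedule(rq);		/* rq->lock NOT held, preemption off;
					 * retakes the lock only if the flag
					 * was set */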
Signed-off-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 82 +++++++++++++++++++++++++++++++-------------------- kernel/sched_rt.c | 31 +++++++------------ 3 files changed, 61 insertions(+), 53 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c35bc29d2a9..195d72d5c102 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,7 +1047,6 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index a030d4514cdc..613fee54fc89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -616,6 +616,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -2839,17 +2840,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static int finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; - int post_schedule = 0; - -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2880,10 +2875,44 @@ static int finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else - return post_schedule; +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ } +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2892,14 +2921,14 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - int post_schedule; - post_schedule = finish_task_switch(rq, prev); + finish_task_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -2913,7 +2942,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. 
*/ -static inline int +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2960,7 +2989,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - return finish_task_switch(this_rq(), prev); + finish_task_switch(this_rq(), prev); } /* @@ -5371,7 +5400,6 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; - int post_schedule = 0; struct rq *rq; int cpu; @@ -5403,10 +5431,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5422,25 +5447,17 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else { -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif + } else spin_unlock_irq(&rq->lock); - } -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + post_schedule(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -9403,6 +9420,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01994e0..a8f89bc3e5eb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1056,6 +1056,11 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1064,6 +1069,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); + return p; } @@ -1262,11 +1273,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_pushable_task(struct rq *rq) { struct task_struct *p; @@ -1466,23 +1472,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } -/* - * assumes rq->lock is held - */ -static int needs_post_schedule_rt(struct rq *rq) -{ - return has_pushable_tasks(rq); -} - static void post_schedule_rt(struct rq *rq) { - /* - * This is only called if needs_post_schedule_rt() indicates that - * we need to push tasks away - */ - spin_lock_irq(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); } /* @@ -1758,7 +1750,6 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, - .needs_post_schedule = 
needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, -- cgit v1.2.3 From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:23 -0400 Subject: sched: Fully integrate cpus_active_map and root-domain code Reflect "active" cpus in the rq->rd->online field, instead of the online_map. The motivation is that things that use the root-domain code (such as cpupri) only care about cpus classified as "active" anyway. By synchronizing the root-domain state with the active map, we allow several optimizations. For instance, we can remove an extra cpumask_and from the scheduler hotpath by utilizing rq->rd->online (since it is now a cached version of cpu_active_map & rq->rd->span). Signed-off-by: Gregory Haskins Acked-by: Peter Zijlstra Acked-by: Max Krasnyansky Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 10 +++++++--- kernel/sched_rt.c | 7 ------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 613fee54fc89..475138c42548 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7927,7 +7927,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9aa..493472984879 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1046,17 +1046,21 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_mask) + * hence we need to mask them out (rq->rd->online) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) + +#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) + static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; unsigned int chosen_wakeup_cpu; int this_cpu; + struct rq *task_rq = task_rq(p); /* * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu @@ -1089,10 +1093,10 @@ static int wake_idle(int cpu, struct task_struct *p) for_each_domain(cpu, sd) { if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { + && !task_hot(p, task_rq->clock, sd))) { for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (cpu_active(i) && idle_cpu(i)) { + if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a8f89bc3e5eb..13f728ef5b38 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1172,13 +1172,6 @@ static int find_lowest_rq(struct task_struct *task) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ - /* - * Only consider CPUs that are usable for migration. - * I guess we might want to change cpupri_find() to ignore those - * in the first place. 
- */ - cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); - /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect -- cgit v1.2.3 From 8f48894fcc89ddec62e1762f73a0825793e59e91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 12:25:30 +0200 Subject: sched: Add debug check to task_of() A frequent mistake appears to be to call task_of() on a scheduler entity that is not actually a task, which can result in a wild pointer. Add a check to catch these mistakes. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 20 ++++++++++++++------ kernel/sched_rt.c | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 493472984879..342000b31ad6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 13f728ef5b38..f365e66b3d49 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,15 +3,18 @@ * policies) */ +#ifdef CONFIG_RT_GROUP_SCHED + +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif return container_of(rt_se, struct task_struct, rt); } -#ifdef CONFIG_RT_GROUP_SCHED - -#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) - static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #define rt_entity_is_task(rt_se) (1) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); -- cgit v1.2.3 From 693525e3bea25cf2ee6cf2b862ba7c148e891df2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:56:38 +0200 Subject: sched: Ensure the migration task doesn't go away during use Like sched_migrate_task(), set_cpus_allowed_ptr() should hold onto the migration thread too. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 475138c42548..7f83be35d65c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7078,8 +7078,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; -- cgit v1.2.3 From bbfa26229a8143889e95e0df4a9d69067ee836cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 14:44:24 +0200 Subject: lockdep: Fix BFS build Fix: kernel/built-in.o: In function `lockdep_stats_show': lockdep_proc.c:(.text+0x48202): undefined reference to `max_bfs_queue_depth' As max_bfs_queue_depth is only available under CONFIG_PROVE_LOCKING=y. Cc: Ming Lei Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9a1bf34d2ff6..fba81f16e346 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -411,8 +411,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) max_lockdep_depth); seq_printf(m, " max recursion depth: %11u\n", max_recursion_depth); +#ifdef CONFIG_PROVE_LOCKING seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); +#endif lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); -- cgit v1.2.3 From bcf08df3b23b3d13bf8c4ad6bd744a6ad30015fb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 19 Apr 2008 12:11:10 +0200 Subject: sched: Fix cpupri build on !CONFIG_SMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This build bug: In file included from kernel/sched.c:1765: kernel/sched_rt.c: In function ‘has_pushable_tasks’: kernel/sched_rt.c:1069: error: ‘struct rt_rq’ has no member named ‘pushable_tasks’ kernel/sched_rt.c: In function ‘pick_next_task_rt’: kernel/sched_rt.c:1084: error: ‘struct rq’ has no member named ‘post_schedule’ Triggers because both pushable_tasks and post_schedule are SMP-only fields. Move pushable_tasks() to the SMP section and #ifdef the post_schedule use. 
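[ For reference, the fix combines the two standard idioms for SMP-only code, sketched here in simplified form: keep the helper's definition inside the existing #ifdef CONFIG_SMP block, and guard the one use of the SMP-only rq field at the call site. ]

	#ifdef CONFIG_SMP
	static inline int has_pushable_tasks(struct rq *rq)
	{
		return !plist_head_empty(&rq->rt.pushable_tasks);
	}
	#endif

	#ifdef CONFIG_SMP
		/* rq->post_schedule only exists on SMP builds */
		rq->post_schedule = has_pushable_tasks(rq);
	#endif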
Cc: Gregory Haskins Cc: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f365e66b3d49..3d4020a9ba1b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -136,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + #else static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -1064,11 +1069,6 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1077,11 +1077,13 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); +#ifdef CONFIG_SMP /* * We detect this state here so that we can avoid taking the RQ * lock again later if there is no need to push */ rq->post_schedule = has_pushable_tasks(rq); +#endif return p; } -- cgit v1.2.3 From 4f84f4330a11b9eb828bf5af557f4c79c64614a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 15:27:04 +0200 Subject: lockdep: Fix backtraces Truncate stupid -1 entries in backtraces. Signed-off-by: Peter Zijlstra LKML-Reference: <1248096665.15751.8816.camel@twins> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1cedb00e3e7a..2f0970297e30 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -367,11 +367,21 @@ static int save_trace(struct stack_trace *trace) save_stack_trace(trace); + /* + * Some daft arches put -1 at the end to indicate its a full trace. + * + * this is buggy anyway, since it takes a whole extra entry so a + * complete trace that maxes out the entries provided will be reported + * as incomplete, friggin useless + */ + if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + trace->nr_entries--; + trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { if (!debug_locks_off_graph_unlock()) return 0; -- cgit v1.2.3 From 98c33eddaf41d225d99b40f9eedbd0fac4c08c05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:07 +0200 Subject: lockdep: Fix style nits fixes a few comments and whitespaces that annoyed me. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2f0970297e30..4b6cebe8ab31 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -843,15 +843,18 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 1; } -/*For good efficiency of modular, we use power of 2*/ +/* + * For good efficiency of modular, we use power of 2 + */ #define MAX_CIRCULAR_QUEUE_SIZE 4096UL #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) -/* The circular_queue and helpers is used to implement the +/* + * The circular_queue and helpers is used to implement the * breadth-first search(BFS)algorithem, by which we can build * the shortest path from the next lock to be acquired to the * previous held lock if there is a circular between them. - * */ + */ struct circular_queue { unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; @@ -907,6 +910,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, struct lock_list *parent) { unsigned long nr; + nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); lock->parent = parent; @@ -916,6 +920,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, static inline unsigned long lock_accessed(struct lock_list *lock) { unsigned long nr; + nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); return test_bit(nr, bfs_accessed); -- cgit v1.2.3 From f607c6685774811b8112e124f10a053d77015485 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 19:16:29 +0200 Subject: lockdep: Introduce lockdep_assert_held() Add a lockdep helper to validate that we indeed are the owner of a lock. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 ++++++++ kernel/lockdep.c | 33 +++++++++++++++++++++++++++++++++ kernel/sched.c | 2 ++ 3 files changed, 43 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 12aabfcb45f6..a6d5e5e4d084 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -296,6 +296,10 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) + +extern int lock_is_held(struct lockdep_map *lock); + extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); @@ -314,6 +318,8 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) +#define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l)) + #else /* !LOCKDEP */ static inline void lockdep_off(void) @@ -358,6 +364,8 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_assert_held(l) do { } while (0) + #endif /* !LOCKDEP */ #ifdef CONFIG_LOCK_STAT diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4b6cebe8ab31..28914a509914 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3059,6 +3059,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) check_chain_key(curr); } +static int __lock_is_held(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + if (curr->held_locks[i].instance == lock) + return 1; + } + + return 0; +} + /* * Check whether we follow the irq-flags state precisely: */ @@ -3160,6 +3173,26 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); +int lock_is_held(struct lockdep_map *lock) +{ + unsigned long flags; + int ret = 0; + + if (unlikely(current->lockdep_recursion)) + return ret; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + ret = __lock_is_held(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(lock_is_held); + void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { current->lockdep_reclaim_gfp = gfp_mask; diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..2c75f7daa439 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6609,6 +6609,8 @@ int cond_resched_lock(spinlock_t *lock) int resched = should_resched(); int ret = 0; + lockdep_assert_held(lock); + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) -- cgit v1.2.3 From bb97a91e2549a7f2df9c21d32542582f549ab3ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Jul 2009 19:15:35 +0200 Subject: lockdep: Deal with many similar locks spin_lock_nest_lock() allows taking many instances of the same class, which can easily lead to an overflow of MAX_LOCK_DEPTH. To avoid this overflow, we'll stop accounting instances but start reference counting the class in the held_lock structure. [ We could maintain a list of instances, if we'd move the hlock stuff into __lock_acquired(), but that would require significant modifications to the current code. ] We restrict this mode to spin_lock_nest_lock() only, because it degrades the lockdep quality due to the loss of instance information. For lockstat this means we don't track lock statistics for any but the first lock in the series. Currently nesting is limited to 11 bits because that was the spare space available in held_lock. This yields a maximum of 2048 instances.
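[ Condensed from the patch below (a simplified sketch): when the lock being acquired has the same class as the top of the held-lock stack and a nest_lock is given, __lock_acquire() only bumps a reference count instead of consuming another held_lock slot. ]

	if (depth) {
		hlock = curr->held_locks + depth - 1;
		if (hlock->class_idx == class_idx && nest_lock) {
			if (hlock->references)
				hlock->references++;	/* yet another instance */
			else
				hlock->references = 2;	/* the first one, plus this one */

			return 1;	/* no new stack entry consumed */
		}
	}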
Signed-off-by: Peter Zijlstra Cc: Marcelo Tosatti Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++- kernel/lockdep.c | 89 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a6d5e5e4d084..47d42eff6124 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -213,10 +213,12 @@ struct held_lock { * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ - unsigned int trylock:1; + unsigned int trylock:1; /* 16 bits */ + unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:2; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; + unsigned int references:11; /* 32 bits */ }; /* diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 28914a509914..0bb246e21cd7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2708,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + int references) { struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; unsigned int depth, id; int chain_head = 0; + int class_idx; u64 chain_key; if (!prove_locking) @@ -2762,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; + class_idx = class - lock_classes + 1; + + if (depth) { + hlock = curr->held_locks + depth - 1; + if (hlock->class_idx == class_idx && nest_lock) { + if (hlock->references) + hlock->references++; + else + hlock->references = 2; + + return 1; + } + } + hlock = curr->held_locks + depth; if (DEBUG_LOCKS_WARN_ON(!class)) return 0; - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class_idx; hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; @@ -2773,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = !!hardirqs_off; + hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); @@ -2881,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +{ + if (hlock->instance == lock) + return 1; + + if (hlock->references) { + struct lock_class *class = lock->class_cache; + + if (!class) + class = look_up_lock_class(lock, 0); + + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + + if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) + return 0; + + if (hlock->class_idx == class - lock_classes + 1) + return 1; + } + + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -2904,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -2923,7 +2964,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) + 
hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -2962,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr, */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } return print_unlock_inbalance_bug(curr, lock, ip); found_it: - lock_release_holdtime(hlock); + if (hlock->instance == lock) + lock_release_holdtime(hlock); + + if (hlock->references) { + hlock->references--; + if (hlock->references) { + /* + * We had, and after removing one, still have + * references, the current lock stack is still + * valid. We're done! + */ + return 1; + } + } /* * We have the right lock to unlock, 'hlock' points to it. * Now we remove it from the stack, and add back the other * entries (if any), recalculating the hash along the way: */ + curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -2984,7 +3040,8 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) return 0; } @@ -3014,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr, /* * Is the unlock non-nested: */ - if (hlock->instance != lock) + if (hlock->instance != lock || hlock->references) return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; @@ -3065,7 +3122,9 @@ static int __lock_is_held(struct lockdep_map *lock) int i; for (i = 0; i < curr->lockdep_depth; i++) { - if (curr->held_locks[i].instance == lock) + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) return 1; } @@ -3148,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip); + irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -3252,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3260,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + hlock->waittime_stamp = sched_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); @@ -3299,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) */ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) break; - if (hlock->instance == lock) + if (match_held_lock(hlock, lock)) goto found_it; prev_hlock = hlock; } @@ -3307,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) return; found_it: + if (hlock->instance != lock) + return; + cpu = smp_processor_id(); if (hlock->waittime_stamp) { now = sched_clock(); -- cgit v1.2.3 From e351b660fddd4df76cc4635f896d311ed0ff3752 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 22 Jul 2009 22:48:09 +0800 Subject: lockdep: Reintroduce generation count to make BFS faster We still can apply DaveM's generation count optimization to BFS, based on the following idea: - before doing each BFS, increase the global generation id by 1 - if one node in the graph has been visited, mark it as visited by storing the current 
global generation id into the node's dep_gen_id field - so we can decide if one node has been visited already, by comparing the node's dep_gen_id with the global generation id. By applying DaveM's generation count optimization to the current implementation of BFS, we gain the following advantages: - we save MAX_LOCKDEP_ENTRIES/8 bytes of memory; - we remove the bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); in each BFS, which is very time-consuming since MAX_LOCKDEP_ENTRIES may be very large (16384UL). Signed-off-by: Ming Lei Signed-off-by: Peter Zijlstra Cc: "David S. Miller" LKML-Reference: <1248274089-6358-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 1 + kernel/lockdep.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 47d42eff6124..9ccf0e286b2a 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -58,6 +58,7 @@ struct lock_class { struct lockdep_subclass_key *key; unsigned int subclass; + unsigned int dep_gen_id; /* * IRQ/softirq usage tracking bits: diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0bb246e21cd7..2b443b5090a1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -861,14 +861,15 @@ struct circular_queue { }; static struct circular_queue lock_cq; -static unsigned long bfs_accessed[BITS_TO_LONGS(MAX_LOCKDEP_ENTRIES)]; unsigned int max_bfs_queue_depth; +static unsigned int lockdep_dependency_gen_id; + static inline void __cq_init(struct circular_queue *cq) { cq->front = cq->rear = 0; - bitmap_zero(bfs_accessed, MAX_LOCKDEP_ENTRIES); + lockdep_dependency_gen_id++; } static inline int __cq_empty(struct circular_queue *cq) @@ -914,7 +915,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); lock->parent = parent; - set_bit(nr, bfs_accessed); + lock->class->dep_gen_id = lockdep_dependency_gen_id; } static inline unsigned long lock_accessed(struct lock_list *lock) @@ -923,7 +924,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) nr = lock - list_entries; WARN_ON(nr >= nr_list_entries); - return test_bit(nr, bfs_accessed); + return lock->class->dep_gen_id == lockdep_dependency_gen_id; } static inline struct lock_list *get_lock_parent(struct lock_list *child) @@ -3604,7 +3605,7 @@ void __init lockdep_info(void) sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + sizeof(bfs_accessed) + + sizeof(struct circular_queue) #endif ); -- cgit v1.2.3 From 90629209a020859b67423a6326f3765f220c7f5c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 2 Aug 2009 21:43:36 +0800 Subject: lockdep: Fix memory usage info of BFS The unit is KB, so sizeof(struct circular_queue) should be divided by 1024.
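[ The bracketing bug in a nutshell, with a, b, c standing in for the other table sizes (hypothetical names, for illustration only): ]

	/* before: sizeof(struct circular_queue) was added in bytes
	 * to a sum that had already been converted to KB */
	(a + b + c) / 1024 + sizeof(struct circular_queue)

	/* after: everything is summed in bytes, then converted to KB */
	(a + b + c + sizeof(struct circular_queue)) / 1024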
Signed-off-by: Ming Lei Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: davem@davemloft.net Cc: Ming Lei Cc: a.p.zijlstra@chello.nl LKML-Reference: <1249220616-7190-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2b443b5090a1..0843584aed5e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3603,10 +3603,11 @@ void __init lockdep_info(void) sizeof(struct list_head) * CLASSHASH_SIZE + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE) / 1024 + sizeof(struct list_head) * CHAINHASH_SIZE #ifdef CONFIG_PROVE_LOCKING + sizeof(struct circular_queue) #endif + ) / 1024 ); printk(" per task-struct memory footprint: %lu bytes\n", -- cgit v1.2.3 From 04b06256ccdd0af50ece04530f7dde0f81aa8f46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2009 23:25:50 -0700 Subject: rcu: Fix RCU & CPU hotplug hang This patch divides the rcutree initialization into boot-time and hotplug-time components, so that the tree data structures are guaranteed to be fully linked at boot time regardless of what might happen in CPU hotplug operations. This should prevent the noted panic, but then again so should the pre-patch setup... Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 12 ++++++++++- kernel/rcupreempt.c | 9 -------- kernel/rcutree.c | 62 ++++++++++++++++++++++++++++------------------------- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3fea910f402c..9f0584e84bb6 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -248,8 +248,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, void __init rcu_init(void) { - hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + int i; + __rcu_init(); + hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + for_each_online_cpu(i) + rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); } void rcu_scheduler_starting(void) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 4300212b2107..594835363550 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1467,15 +1467,6 @@ void __init __rcu_init(void) rdp->waitschedtail = &rdp->waitschedlist; rdp->rcu_sched_sleeping = 0; } - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long) cpu); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e8e9e93ed6e1..3313244e8ea3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1325,22 +1325,40 @@ int rcu_needs_cpu(int cpu) } /* - * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" - * approach so that we don't have to worry about how long the CPU has - * been gone, or whether it ever was online previously. We do trust the - * ->mynode field, as it is constant for a given struct rcu_data and - * initialized during early boot. 
- * - * Note that only one online or offline event can be happening at a given - * time. Note also that we can accept some slop in the rsp->completed - * access due to the fact that this CPU cannot possibly have any RCU - * callbacks in flight yet. + * Do boot-time initialization of a CPU's per-CPU RCU data. + */ +static void __init +rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + int i; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + spin_lock_irqsave(&rnp->lock, flags); + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a CPU's per-CPU RCU data. Note that only one online or + * offline event can be happening at a given time. Note also that we + * can accept some slop in the rsp->completed access due to the fact + * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; - int i; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; @@ -1355,16 +1373,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->passed_quiesc_completed = lastcomp - 1; - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen = 0; rdp->blimit = blimit; -#ifdef CONFIG_NO_HZ - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ - rdp->cpu = cpu; spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -1534,17 +1543,12 @@ void __init __rcu_init(void) #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_init_one(&rcu_state); RCU_DATA_PTR_INIT(&rcu_state, rcu_data); + for_each_possible_cpu(i) + rcu_boot_init_percpu_data(i, &rcu_state); rcu_init_one(&rcu_bh_state); RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - for_each_online_cpu(i) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); - + for_each_possible_cpu(i) + rcu_boot_init_percpu_data(i, &rcu_bh_state); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } -- cgit v1.2.3 From 7256cf0e83bf018be8a81806593aaef7f2437f0b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Aug 2009 10:21:10 -0700 Subject: rcu: Add diagnostic check for a possible CPU-hotplug race Complain if the RCU softirq code ever runs on a CPU that has not yet been announced to RCU as being online. Signed-off-by: Paul E. 
McKenney LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3313244e8ea3..b9b1928e8be7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1132,6 +1132,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + WARN_ON_ONCE(rdp->beenonline == 0); + /* * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. -- cgit v1.2.3 From 25f6e89bedd29cc49bfa0d55497e91a671b9ae6e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 30 Jul 2009 23:21:18 +0200 Subject: x86: Remove superfluous NULL pointer check in destroy_irq() This takes care of the following entry from Dan's list: arch/x86/kernel/apic/io_apic.c +3241 destroy_irq(11) warning: variable derefenced before check 'desc' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Jonathan Corbet Cc: Eugene Teo Cc: Julia Lawall LKML-Reference: <200907302321.19086.bzolnier@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf51b0b58c56..7e92a9212fd7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3185,8 +3185,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); -- cgit v1.2.3 From 47cab6a722d44c71c4f8224017ef548522243cf4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Aug 2009 09:31:54 +0200 Subject: debug lockups: Improve lockup detection, fix generic arch fallback As Andrew noted, my previous patch ("debug lockups: Improve lockup detection") broke/removed SysRq-L support from architecture that do not provide a __trigger_all_cpu_backtrace implementation. Restore a fallback path and clean up the SysRq-L machinery a bit: - Rename the arch method to arch_trigger_all_cpu_backtrace() - Simplify the define - Document the method a bit - in the hope of more architectures adding support for it. [ The patch touches Sparc code for the rename. ] Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: "David S. 
Miller" LKML-Reference: <20090802140809.7ec4bb6b.akpm@linux-foundation.org> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/irq_64.h | 4 ++-- arch/sparc/kernel/process_64.c | 4 ++-- arch/x86/include/asm/nmi.h | 4 ++-- arch/x86/kernel/apic/nmi.c | 2 +- drivers/char/sysrq.c | 15 ++++++++++++++- include/linux/nmi.h | 19 +++++++++++++++++-- 6 files changed, 38 insertions(+), 10 deletions(-) diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 1934f2cbf513..a0b443cb3c1f 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,8 +89,8 @@ static inline unsigned long get_softint(void) return retval; } -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; extern void *softirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 4041f94e7724..18d67854a1b8 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void) static void sysrq_handle_globreg(int key, struct tty_struct *tty) { - __trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c86e5ed4af51..e63cf7d441e1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace static inline void localise_nmi_watchdog(void) { diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 1bb1ac20e9ec..db7220220d09 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -554,7 +554,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 165f307f30e8..50eecfe1d724 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -223,7 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - trigger_all_cpu_backtrace(); + /* + * Fall back to the workqueue based printing if the + * backtrace printing did not succeed or the + * architecture has no support for it: + */ + if (!trigger_all_cpu_backtrace()) { + struct pt_regs *regs = get_irq_regs(); + + if (regs) { + printk(KERN_INFO "CPU%d:\n", smp_processor_id()); + show_regs(regs); + } + schedule_work(&sysrq_showallcpus); + } } static struct sysrq_key_op sysrq_showallcpus_op = { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 29af2d5df097..b752e807adde 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -28,8 +28,23 @@ static inline 
void acpi_nmi_disable(void) { } static inline void acpi_nmi_enable(void) { } #endif -#ifndef trigger_all_cpu_backtrace -#define trigger_all_cpu_backtrace() do { } while (0) +/* + * Create trigger_all_cpu_backtrace() out of the arch-provided + * base function. Return whether such support was available, + * to allow calling code to fall back to some other mechanism: + */ +#ifdef arch_trigger_all_cpu_backtrace +static inline bool trigger_all_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(); + + return true; +} +#else +static inline bool trigger_all_cpu_backtrace(void) +{ + return false; +} #endif #endif -- cgit v1.2.3 From 42c4ab41a176ee784c0f28c0b29025a8fc34f05a Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:26 +0200 Subject: itimers: Merge ITIMER_VIRT and ITIMER_PROF Both CPU itimers share the same data flow in a few places; this patch unifies the code related to the VIRT and PROF itimers. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-2-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 ++++- kernel/fork.c | 9 +-- kernel/itimer.c | 146 +++++++++++++++++++++------------------- kernel/posix-cpu-timers.c | 98 +++++++++++++++---------------- 4 files changed, 130 insertions(+), 137 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..3b3efaddd953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -470,6 +470,11 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +struct cpu_itimer { + cputime_t expires; + cputime_t incr; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units @@ -564,9 +569,12 @@ struct signal_struct { struct pid *leader_pid; ktime_t it_real_incr; - /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ - cputime_t it_prof_expires, it_virt_expires; - cputime_t it_prof_incr, it_virt_incr; + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f7..893ab0bf5e39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -790,10 +791,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times.
*/ sig->cputime_expires.prof_exp = cputime_zero; diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7077ec..852c88ddd1f0 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -41,10 +41,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = jiffies_to_cputime(1); + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +88,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -128,6 +127,36 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *value, struct itimerval *ovalue) +{ + cputime_t cval, cinterval, nval, ninterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + + spin_unlock_irq(&tsk->sighand->siglock); + + if 
(ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +168,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -174,48 +202,10 @@ again: spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b2..9b2d5e4dc8c4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -14,11 +14,11 @@ */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -613,6 +613,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. 
*/ @@ -620,30 +623,27 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, + if (!cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + cputime_lt(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, + if (!cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + cputime_lt(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1070,6 +1070,27 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + it->expires = it->incr; + if (!cputime_eq(it->expires, cputime_zero)) + it->expires = cputime_add(it->expires, cur_time); + + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1089,10 +1110,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1152,38 +1173,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. 
*/ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; -- cgit v1.2.3 From 8356b5f9c424e5831715abbce747197c30d1fd71 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:27 +0200 Subject: itimers: Fix periodic ticks precision Measure the interval error of the ITIMER_PROF and ITIMER_VIRT timers between the real ticks and the interval requested by the user, and take it into account when scheduling the next tick. This patch introduces the possibility that the time between two consecutive ticks is smaller than the requested interval; it preserves, however, the guarantee that the nth tick is generated no earlier than n*interval time - counting from the beginning of periodic signal generation. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-3-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/itimer.c | 24 +++++++++++++++++++++--- kernel/posix-cpu-timers.c | 20 +++++++++++++++++--- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b3efaddd953..a069e65e8bb7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -473,6 +473,8 @@ struct pacct_struct { struct cpu_itimer { cputime_t expires; cputime_t incr; + u32 error; + u32 incr_error; }; /** diff --git a/kernel/itimer.c b/kernel/itimer.c index 852c88ddd1f0..21adff7b2a17 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -42,7 +42,7 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *value) + struct itimerval *const value) { cputime_t cval, cinterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -127,14 +127,32 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ?
0 : cpu_ns - real_ns; +} + static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *value, struct itimerval *ovalue) + const struct itimerval *const value, + struct itimerval *const ovalue) { - cputime_t cval, cinterval, nval, ninterval; + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); spin_lock_irq(&tsk->sighand->siglock); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9b2d5e4dc8c4..b60d644ea4b3 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1070,6 +1070,8 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_t *expires, cputime_t cur_time, int signo) { @@ -1077,9 +1079,16 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, return; if (cputime_ge(cur_time, it->expires)) { - it->expires = it->incr; - if (!cputime_eq(it->expires, cputime_zero)) - it->expires = cputime_add(it->expires, cur_time); + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + jiffies_to_cputime(1)); + it->error -= onecputick; + } + } else + it->expires = cputime_zero; __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } @@ -1696,10 +1705,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(jiffies_to_cputime(1), &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); -- cgit v1.2.3 From d1e3b6d195770bd422e3229b88edfc154b6a27dd Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:28 +0200 Subject: itimers: Simplify arm_timer() code a bit Don't update values in the expiration cache when the new ones are equal. Add expires_le() and expires_gt() helpers to simplify the code.
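For reference, the semantics of the new helpers are easy to state in isolation: a cached expiry of zero means "nothing cached", so a "greater than" test must treat zero as infinitely far away. A minimal standalone C sketch of that logic, using a plain integer in place of cputime_t (the type alias and the cache_expiry() wrapper are illustrative stand-ins, not kernel code):

typedef unsigned long long fake_cputime_t;	/* stand-in for cputime_t */

/* Zero means "no expiry cached", i.e. it compares as +infinity. */
static inline int expires_gt(fake_cputime_t expires, fake_cputime_t new_exp)
{
	return expires == 0 || expires > new_exp;
}

static inline int expires_le(fake_cputime_t expires, fake_cputime_t new_exp)
{
	return expires != 0 && expires <= new_exp;
}

/* Typical caller: only tighten the cache when the new expiry is sooner. */
static void cache_expiry(fake_cputime_t *cached, fake_cputime_t new_exp)
{
	if (expires_gt(*cached, new_exp))
		*cached = new_exp;
}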
Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-4-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b60d644ea4b3..69c92374355f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -541,6 +541,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -585,31 +596,26 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { @@ -623,17 +629,13 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(sig->it[CPUCLOCK_VIRT].expires, - cputime_zero) && - cputime_lt(sig->it[CPUCLOCK_VIRT].expires, + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, exp->cpu)) break; sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(sig->it[CPUCLOCK_PROF].expires, - cputime_zero) && - cputime_lt(sig->it[CPUCLOCK_PROF].expires, + if (expires_le(sig->it[CPUCLOCK_PROF].expires, exp->cpu)) break; i = sig->rlim[RLIMIT_CPU].rlim_cur; -- cgit v1.2.3 From a42548a18866e87092db93b771e6c5b060d78401 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:29 +0200 Subject: cputime: Optimize jiffies_to_cputime(1) For powerpc with CONFIG_VIRT_CPU_ACCOUNTING, jiffies_to_cputime(1) is not a compile-time constant and the run-time calculation is quite expensive. To optimize this we use a precomputed value. For all other architectures it is a preprocessor definition.
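The pattern behind this optimization is generic: a value that is constant after boot but expensive to derive at run time is computed once during initialization and then read from a cache everywhere else. A hedged, self-contained sketch of the idea (the conversion function and its factor are invented for illustration; only the caching structure mirrors the patch):

#include <stdio.h>

/* Stand-in for a conversion that is expensive at run time on some arches. */
static unsigned long long jiffies_to_cputime_slow(unsigned long jif)
{
	return jif * 4096ULL;	/* imagine calibrated multiplies/divides here */
}

/* Computed once at init, mirroring the kernel's cputime_one_jiffy. */
static unsigned long long one_jiffy_cached;

static void setup_one_jiffy(void)
{
	one_jiffy_cached = jiffies_to_cputime_slow(1);
}

int main(void)
{
	setup_one_jiffy();	/* done once at boot, from time_init() on powerpc */
	printf("one jiffy = %llu cputime units\n", one_jiffy_cached);
	return 0;
}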
Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-5-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/cputime.h | 1 + arch/powerpc/include/asm/cputime.h | 13 +++++++++++++ arch/powerpc/kernel/time.c | 4 ++++ arch/s390/include/asm/cputime.h | 1 + include/asm-generic/cputime.h | 1 + kernel/itimer.c | 4 ++-- kernel/posix-cpu-timers.c | 6 +++--- kernel/sched.c | 9 ++++----- 8 files changed, 29 insertions(+), 10 deletions(-) diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index d20b998cb91d..7fa8a8594660 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -30,6 +30,7 @@ typedef u64 cputime_t; typedef u64 cputime64_t; #define cputime_zero ((cputime_t)0) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~((cputime_t)0) >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index f42e623030ee..fa19f3fe05ff 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -18,6 +18,9 @@ #ifndef CONFIG_VIRT_CPU_ACCOUNTING #include +#ifdef __KERNEL__ +static inline void setup_cputime_one_jiffy(void) { } +#endif #else #include @@ -48,6 +51,11 @@ typedef u64 cputime64_t; #ifdef __KERNEL__ +/* + * One jiffy in timebase units computed during initialization + */ +extern cputime_t cputime_one_jiffy; + /* * Convert cputime <-> jiffies */ @@ -89,6 +97,11 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif) return ct; } +static inline void setup_cputime_one_jiffy(void) +{ + cputime_one_jiffy = jiffies_to_cputime(1); +} + static inline cputime64_t jiffies64_to_cputime64(const u64 jif) { cputime_t ct; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceeac..211d7b0cd370 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -193,6 +193,8 @@ EXPORT_SYMBOL(__cputime_clockt_factor); DEFINE_PER_CPU(unsigned long, cputime_last_delta); DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); +cputime_t cputime_one_jiffy; + static void calc_cputime_factors(void) { struct div_result res; @@ -500,6 +502,7 @@ static int __init iSeries_tb_recal(void) tb_to_xs = divres.result_low; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; vdso_data->tb_to_xs = tb_to_xs; + setup_cputime_one_jiffy(); } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -945,6 +948,7 @@ void __init time_init(void) tb_ticks_per_usec = ppc_tb_freq / 1000000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); + setup_cputime_one_jiffy(); /* * Calculate the length of each tick in ns. 
It will not be diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 7a3817a656df..24b1244aadb9 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -42,6 +42,7 @@ __div(unsigned long long n, unsigned int base) #endif /* __s390x__ */ #define cputime_zero (0ULL) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 1c1fa422d18a..ca0f239f0e13 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -7,6 +7,7 @@ typedef unsigned long cputime_t; #define cputime_zero (0UL) +#define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) diff --git a/kernel/itimer.c b/kernel/itimer.c index 21adff7b2a17..8078a32d3b10 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -64,7 +64,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, if (cputime_le(cval, t)) /* about to fire */ - cval = jiffies_to_cputime(1); + cval = cputime_one_jiffy; else cval = cputime_sub(cval, t); } @@ -161,7 +161,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, if (!cputime_eq(cval, cputime_zero) || !cputime_eq(nval, cputime_zero)) { if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, jiffies_to_cputime(1)); + nval = cputime_add(nval, cputime_one_jiffy); set_process_cpu_timer(tsk, clock_id, &nval, &cval); } it->expires = nval; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 69c92374355f..18bdde6f676f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1086,7 +1086,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, it->error += it->incr_error; if (it->error >= onecputick) { it->expires = cputime_sub(it->expires, - jiffies_to_cputime(1)); + cputime_one_jiffy); it->error -= onecputick; } } else @@ -1461,7 +1461,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. 
*/ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1712,7 +1712,7 @@ static __init int init_posix_cpu_timers(void) register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); - cputime_to_timespec(jiffies_to_cputime(1), &ts); + cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; WARN_ON(ts.tv_sec != 0); diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..8f977d5cc515 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5031,17 +5031,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* -- cgit v1.2.3 From ed8d9adf357ec331603fa1049510399812cea7e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 14:08:48 +0900 Subject: x86, percpu: Add 'percpu_read_stable()' interface for cacheable accesses This is very useful for some common things like 'get_current()' and 'get_thread_info()', which can be used multiple times in a function, and where the result is cacheable. tj: Added the magical undocumented "P" modifier to UP __percpu_arg() to force gcc to dereference the pointer value passed in via the "p" input constraint. Without this, percpu_read_stable() returns the address of the percpu variable. Also added comment explaining the difference between percpu_read() and percpu_read_stable(). Signed-off-by: Linus Torvalds Signed-off-by: Tejun Heo Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/current.h | 2 +- arch/x86/include/asm/percpu.h | 26 +++++++++++++++++++------- arch/x86/include/asm/thread_info.h | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c68c361697e1..4d447b732d82 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read(current_task); + return percpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d85..04eacefcfd26 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -49,7 +49,7 @@ #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) #else -#define __percpu_arg(x) "%" #x +#define __percpu_arg(x) "%P" #x #endif /* @@ -104,36 +104,48 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ default: __bad_percpu_size(); \ } \ ret__; \ }) -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) +/* + * percpu_read() makes gcc load the percpu variable every time it is + * accessed while percpu_read_stable() allows the value to be cached. + * percpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across cpus. The current users include + * get_current() and get_thread_info() both of which are actually + * per-thread variables implemented as per-cpu variables and thus + * stable for the duration of the respective task. + */ +#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ + "m" (per_cpu__##var)) +#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ + "p" (&per_cpu__##var)) #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f8..a1bb5a114bf2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -213,7 +213,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(percpu_read(kernel_stack) + + ti = (void *)(percpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } -- cgit v1.2.3 From 3e352aa8ee2bd48f1a19c7742810b3a4a7ba605e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Aug 2009 14:10:11 +0900 Subject: x86, percpu: Fix DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() put percpu variables in .page_aligned section without adding any alignment restrictions. 
Currently, this doesn't cause any problem because all users of the macros are explicitly page aligned and page-sized, but it's much safer to enforce page alignment from the macros. After all, it's what they claim to do. Add __aligned(PAGE_SIZE) to DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() and drop the explicit alignment from their users. Signed-off-by: Tejun Heo Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 3 +-- include/linux/percpu-defs.h | 8 +++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..12493c5485e5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1008,8 +1008,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 68438e18fff4..afd5f8b7061f 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -69,11 +69,13 @@ /* * Declaration/definition used for per-CPU variables that must be page aligned. */ -#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") \ + __aligned(PAGE_SIZE) #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") + DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") \ + __aligned(PAGE_SIZE) /* * Intermodule exports for per-CPU variables. -- cgit v1.2.3 From bdf977b37418cdf8a2252504779a7e12a09b7575 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Aug 2009 14:12:19 +0900 Subject: x86, percpu: Collect hot percpu variables into one cacheline On x86_64, the percpu variables current_task and kernel_stack are used for get_current() and current_thread_info() respectively and thus are often used close to each other. Move the definition of current_task to kernel/cpu/common.c right above the kernel_stack definition and align it to the cacheline size so that they always fall into the same cacheline. Two percpu variables defined there together - irq_stack_ptr and irq_count - are also pretty hot and will benefit from sharing the cacheline. For consistency, the current_task definition for x86_32 is also moved to kernel/cpu/common.c. Putting current_task and kernel_stack into the same cacheline was suggested by Linus Torvalds. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 15 +++++++++++++-- arch/x86/kernel/process_32.c | 3 --- arch/x86/kernel/process_64.c | 3 --- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12493c5485e5..1bd88ed978bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -987,13 +987,21 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot.
Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1041,6 +1049,9 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, stack_canary); #endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984af..daa4107be3b4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9d..c4c675d5ba1a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -- cgit v1.2.3 From c7bd0414d681706a32105895cae20fb9090db52e Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Thu, 23 Jul 2009 20:56:27 +0200 Subject: x86: Simplify the Makefile in a minor way through use of cc-ifversion Signed-off-by: Frans Pop Acked-by: Sam Ravnborg Reviewed-by: WANG Cong LKML-Reference: <200907232056.28635.elendil@planet.nl> Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b4..1f3851a626d3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ - echo $(call cc-option,-fno-unit-at-a-time); fi ;) + KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ + $(call cc-option,-fno-unit-at-a-time)) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/x86/Makefile_32.cpu -- cgit v1.2.3 From 5b0f437df0a3e374d26ad533eb78fe64744f55a8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 30 Jul 2009 19:00:53 +0200 Subject: workqueues: Improve schedule_work() documentation Two important aspects of the schedule_work() function are not yet documented: - that it is allowed to pass a struct work_struct * to this function that is already on the kernel-global workqueue; - the meaning of its return value. The patch below documents both aspects. 
Signed-off-by: Bart Van Assche Cc: "Greg Kroah-Hartman" Cc: Andrew Morton LKML-Reference: <200907301900.54202.bart.vanassche@gmail.com> Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0668795d8818..3c44b56b0da7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -600,7 +600,12 @@ static struct workqueue_struct *keventd_wq __read_mostly; * schedule_work - put work task in global workqueue * @work: job to be done * - * This puts a job in the kernel-global workqueue. + * Returns zero if @work was already on the kernel-global workqueue and + * non-zero otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ int schedule_work(struct work_struct *work) { -- cgit v1.2.3 From 54a0bf3c2cad3fd118ea725f26a493aece6ea01d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 4 Aug 2009 15:52:38 +0200 Subject: Revert "x86: oprofile/op_model_amd.c set return values for op_amd_handle_ibs()" This reverts commit 21e70878215f620fe99ea7d7c74bc641aeec932f. Instead, Andrew's patch, which he posted at the same time, will be applied. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 827beecb67a4..37d19c768d5f 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -195,7 +195,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 0; + return 1; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -277,10 +277,7 @@ static void op_amd_stop_ibs(void) #else static inline int op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - return 0; -} + struct op_msrs const * const msrs) { } static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3 From cc6db4e60116c1f76577b6850a35ae7de69a95b6 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 31 Jul 2009 16:20:10 -0700 Subject: futex: Correct futex_wait_requeue_pi() commentary The state machine described in the comments wasn't updated with a follow-on fix. Address that and clean up the corresponding commentary in the function. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner LKML-Reference: <4A737C2A.9090001@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f159..d077201b393d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2102,11 +2102,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue and subsequent unlock - * 3) signal (before or after requeue) - * 4) timeout (before or after requeue) + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * - * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * If 3, cleanup and return -ERESTARTNOINTR.
* * If 2, we may then block on trying to take the rt_mutex and return via: * 5) successful lock @@ -2114,7 +2114,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 7) timeout * 8) other lock acquisition failure * - * If 6, we setup a restart_block with futex_lock_pi() as the function. + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * @@ -2232,14 +2232,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* - * We've already been requeued, but we have no way to - * restart by calling futex_lock_pi() directly. We - * could restart the syscall, but that will look at - * the user space value and return right away. So we - * drop back with EWOULDBLOCK to tell user space that - * "val" has been changed. That's the same what the - * restart of the syscall would do in - * futex_wait_setup(). + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. */ ret = -EWOULDBLOCK; } -- cgit v1.2.3 From 4680e64a88c4ce2c4e736dade99233e3def13fa7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 23 Jun 2009 12:36:08 -0700 Subject: arch/x86/oprofile/op_model_amd.c: fix op_amd_handle_ibs() return type arch/x86/oprofile/op_model_amd.c: In function 'op_amd_handle_ibs': arch/x86/oprofile/op_model_amd.c:217: warning: no return statement in function returning non-void Fix this by making op_amd_handle_ibs() return void. Cc: Robert Richter Signed-off-by: Andrew Morton Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 37d19c768d5f..39686c29f03a 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -187,7 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, #ifdef CONFIG_OPROFILE_IBS -static inline int +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -195,7 +195,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 1; + return; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -241,8 +241,6 @@ op_amd_handle_ibs(struct pt_regs * const regs, wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } - - return 1; } static inline void op_amd_start_ibs(void) @@ -276,7 +274,7 @@ static void op_amd_stop_ibs(void) #else -static inline int op_amd_handle_ibs(struct pt_regs * const regs, +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { } static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3 From 990a55eb25d9698d61352264cc43f3a9c04cce90 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 31 Jul 2009 00:02:06 +0200 Subject: irq: Clean up by removing irqfixup MODULE_PARM_DESC() It's wrong (the parm takes no arguments) and compile-time wiped away anyway (due to !MODULE). 
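For contrast, the conventional pairing is shown below; MODULE_PARM_DESC() takes the bare parameter identifier, not a quoted string, and it only has an effect when the code can actually be built as a module (kernel/irq cannot, which is why the line could simply go). The sketch is illustrative only:

#include <linux/moduleparam.h>

static int irqfixup;

module_param(irqfixup, int, 0644);
/* First argument is the identifier itself; compiled away when !MODULE. */
MODULE_PARM_DESC(irqfixup, "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");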
Signed-off-by: Jiri Slaby LKML-Reference: <1248991326-26267-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- kernel/irq/spurious.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 4d568294de3e..114e704760fe 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { -- cgit v1.2.3 From 97fd9ed48ce2b807edc363bef3e817aeeb5cd5e6 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 21 Jul 2009 20:25:05 +0200 Subject: timers: Cache __next_timer_interrupt result Each time a cpu goes to sleep on a NOHZ=y system the timer wheel is searched for the next timer interrupt. It can take quite a few cycles to find the next pending timer. This patch adds a field to tvec_base that caches the result of __next_timer_interrupt. The hit ratio is around 80% on my thinkpad under normal use; on a server I've seen hit ratios from 5% to 95%, depending on the workload. -v2: jiffies wrap fixes Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Cc: john stultz Cc: Venki Pallipadi LKML-Reference: <20090721202505.7d56a079@skybase> Signed-off-by: Ingo Molnar --- kernel/timer.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8b..5c1e49ec2f1b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -72,6 +72,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) @@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_timer_activate(timer); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; spin_lock(&base->lock); - expires =
__next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1523,6 +1541,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1535,6 +1554,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } -- cgit v1.2.3 From 2977fb3ffc8493a2f4f0a362e8660a6cde9f1bb9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 1 Aug 2009 11:47:59 +0400 Subject: x86, ioapic: Introduce for_each_irq_pin() helper This allows us to save a few lines of code. Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org LKML-Reference: <20090801075435.597863129@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 43 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7e92a9212fd7..ffd8fdfcbe4c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -410,7 +412,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -490,22 +492,21 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list **entryp, *entry; + struct irq_pin_list **last, *entry; - for (entryp = &cfg->irq_2_pin; - *entryp != NULL; - entryp = &(*entryp)->next) { - entry = *entryp; - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) return; + last = &entry->next; } entry = get_one_free_irq_2_pin(node); entry->apic = apic; entry->pin = pin; - *entryp = entry; + *last = entry; } /* @@ -517,7 +518,7 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, { struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -537,7 +538,7 @@ static void io_apic_modify_irq(struct irq_cfg *cfg, int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -1669,12 +1670,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) -
break; - entry = entry->next; - } printk("\n"); } @@ -2227,7 +2224,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; apic = entry->apic; @@ -2556,20 +2553,10 @@ static void ack_apic_level(unsigned int irq) #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void -- cgit v1.2.3 From a7428cd2ef77734465e36bceb43290e37e2a97c6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 1 Aug 2009 11:48:00 +0400 Subject: x86, ioapic: Throw BUG instead of NULL dereference Instead of a plain NULL dereference, we'd better print an error message with a backtrace. Actually we need more graceful error handling here. Meanwhile, leave it as is. Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org LKML-Reference: <20090801075435.769301745@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ffd8fdfcbe4c..2a145d3a8375 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -503,6 +503,10 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin } entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list\n"); + BUG_ON(1); + } entry->apic = apic; entry->pin = pin; -- cgit v1.2.3 From 9910887af84e33ba98fd6792029470ae80166208 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jul 2009 00:52:59 +0400 Subject: x86, apic: Drop redundant bit assignment cpu_has_apic has already checked the boot_cpu_data X86_FEATURE_APIC bit for being clear by the time this condition is triggered, so there is no need to clear the bit a second time. Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" LKML-Reference: <20090722205259.GE15805@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec66..0b021c56e822 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1651,7 +1651,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif -- cgit v1.2.3 From ce69a784504222c3ab6f1b3c357d09ec5772127a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 20 Jul 2009 15:24:17 +0300 Subject: x86/apic: Enable x2APIC without interrupt remapping under KVM KVM would like to provide an x2APIC interface to a guest without emulating an interrupt remapping device.
The reason KVM prefers the guest to use x2APIC is that the x2APIC interface is more easily virtualized and provides better performance than the mmio xAPIC interface: - msr exits are faster than mmio (no page table walk, emulation) - no need to read back ICR to look at the busy bit - one 64 bit ICR write instead of two 32 bit writes - shared code with the Hyper-V paravirt interface The included patch changes the x2APIC enabling logic to enable it even if IR initialization fails, provided the kernel runs under KVM and no apic id is greater than 255 (if there is one, the spec requires the BIOS to switch to x2apic mode before starting an OS). -v2: fix build -v3: fix bug causing compiler warning Signed-off-by: Gleb Natapov Acked-by: Suresh Siddha Cc: Sheng Yang Cc: "avi@redhat.com" LKML-Reference: <20090720122417.GR5638@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 7 ++++ arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++++------------------ arch/x86/kernel/apic/probe_64.c | 6 +-- 3 files changed, 56 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d47925847..586b7adb8e53 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -183,6 +183,10 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) +static inline void x2apic_force_phys(void) +{ + x2apic_phys = 1; +} #else static inline void check_x2apic(void) { @@ -194,6 +198,9 @@ static inline int x2apic_enabled(void) { return 0; } +static inline void x2apic_force_phys(void) +{ +} #define x2apic_preenabled 0 #define x2apic_supported() 0 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0b021c56e822..de039fcdd053 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -49,6 +49,7 @@ #include #include #include +#include unsigned int num_processors; @@ -1361,52 +1362,76 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; ret = dmar_table_init(); if (ret) { pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; + return 0; } if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + ret = enable_IR(); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs
can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } - pr_info("Enabled Interrupt-remapping\n"); + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1414,41 +1439,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b82..f3b1037076e4 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else -- cgit v1.2.3 From 0c9e6f639aed490202bbc79214f4495cf4bfde58 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Jul 2009 20:26:06 +0800 Subject: tracing: Simplify print_graph_cpu() print_graph_cpu() is a little over-designed. And "log10_all" may be wrong when there are holes in cpu_online_mask: the max online cpu id can be > cpumask_weight(cpu_online_mask). So change it to use a static column length for the cpu field, matching the number of decimal characters needed for nr_cpu_ids. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <4A6EEE5E.2000001@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 30 ++++--------------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index abf7c4ae2c8b..e30472da15d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -183,43 +183,19 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } -static inline int log10_cpu(int nb) -{ - if (nb / 100) - return 3; - if (nb / 10) - return 2; - return 1; -} +static int max_bytes_for_cpu; static enum print_line_t print_graph_cpu(struct trace_seq *s, int cpu) { - int i; int ret; - int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " "); - - /* - * Tricky - we space the CPU field according to the max - * number of online CPUs. On a 2-cpu system it would take
On a 2-cpu system it would take - * a maximum of 1 digit - on a 128 cpu system it would - * take up to 3 digits: - */ - for (i = 0; i < log10_all - log10_this; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "%d) ", cpu); + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -919,6 +895,8 @@ static struct tracer graph_trace __read_mostly = { static __init int init_graph_trace(void) { + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + return register_tracer(&graph_trace); } -- cgit v1.2.3 From 07868b086cca784f4b532fc2ab574ec3a73b468a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 03:33:51 +0200 Subject: tracing/function-graph-tracer: Drop the useless nmi protection The function graph tracer used to have a protection against NMI while entering a function entry tracing. But this is useless now, this tracer is reentrant and the ring buffer supports the NMI tracing. We can then drop this protection. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9fe..8e9663413b7f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; -- cgit v1.2.3 From 5e5bf483986ad86ad25f25eec5299c86eb2d1c57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:11:12 +0200 Subject: tracing/core: Turn ftrace_cpu_disabled into a global var In order to prepare the moving of the function graph tracer insertion helpers from trace.c to trace_functions_graph.c, we need to export the ftrace_cpu_disabled variable. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38a4a3ee749d..b6211d604131 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 758b0dbed552..c7e92732982d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -519,6 +519,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, -- cgit v1.2.3 From c0a0d0d3f65284c71115a9bb1ed801ee33eeb552 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 17:51:13 +0200 Subject: tracing/core: Make the stack entry helpers global Make the stacktrace event insertion helpers globals. This has two effects: - Prepare for moving the sched events insertion helpers to the sched switch tracer file. 
- Move some ifdef outside function definitions Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++---------------- kernel/trace/trace.h | 28 +++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6211d604131..d6059a493e7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -866,10 +866,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, int skip, int pc); -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc); static inline void __trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, @@ -1003,11 +999,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, ip, parent_ip, flags, pc); } +#ifdef CONFIG_STACKTRACE static void __ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; @@ -1028,12 +1024,10 @@ static void __ftrace_trace_stack(struct trace_array *tr, save_stack_trace(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; @@ -1041,17 +1035,14 @@ static void ftrace_trace_stack(struct trace_array *tr, __ftrace_trace_stack(tr, flags, skip, pc); } -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { __ftrace_trace_stack(tr, flags, skip, pc); } -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc) +void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -1076,7 +1067,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, tr->buffer, event)) ring_buffer_unlock_commit(tr->buffer, event); -#endif } #ifdef UNUSED @@ -1086,6 +1076,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) } #endif /* UNUSED */ +#endif /* CONFIG_STACKTRACE */ + static void ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c7e92732982d..116524d62366 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -489,9 +489,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc); +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc); + +void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, + int pc); + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int 
pc); +#else +static inline void ftrace_trace_stack(struct trace_array *tr, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct trace_array *tr, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ extern cycle_t ftrace_now(int cpu); -- cgit v1.2.3 From 82e04af498a85ba425efe77580b7ba08234411df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:00:29 +0200 Subject: tracing: Move sched event insertion helpers in the sched switch tracer file The sched events helpers which insert the sched switch and wakeup events into the ring buffer currently reside in trace.c. But this file is quite overloaded and the right place for these helpers is in the sched switch tracer file. Then move them to trace_sched_switch.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 56 -------------------------------------- kernel/trace/trace_sched_switch.c | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 56 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6059a493e7f..1b73acb40e56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1105,62 +1105,6 @@ __trace_special(void *__tr, void *__data, ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); } -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); -} - void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a98106dd979c..e1285d7b5488 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,6 +20,34 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static int sched_stopped; + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, +
struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); +} + static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) @@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, local_irq_restore(flags); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); +} + static void probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) { -- cgit v1.2.3 From 1a0799a8fef5acc6503f9c5e79b2cd003317826c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 18:59:58 +0200 Subject: tracing/function-graph-tracer: Move graph event insertion helpers in the graph tracer file The function graph events helpers which insert the function entry and return events into the ring buffer currently reside in trace.c But this file is quite overloaded and the right place for these helpers is in the function graph tracer file. 
Then move them to trace_functions_graph.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace.c | 110 ------------------------------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 122 ++++++++++++++++++++++++++++++++++- kernel/trace/trace_selftest.c | 1 + 4 files changed, 121 insertions(+), 113 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b73acb40e56..0cfd1a62def1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -942,54 +942,6 @@ trace_function(struct trace_array *tr, ring_buffer_unlock_commit(tr->buffer, event); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return 0; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); - - return 1; -} - -static void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); -} -#endif - void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -1129,68 +1081,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - if (!ftrace_graph_addr(trace->func)) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - if (!trace->depth) - clear_tsk_trace_graph(current); - 
atomic_dec(&data->disabled); - local_irq_restore(flags); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - - /** * trace_vbprintk - write binary msg to tracing buffer * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 116524d62366..9301f1263c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -471,6 +471,7 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e30472da15d5..f97244a41a4f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -/* pid on the last trace processed */ +static struct trace_array *graph_array; /* Add a function return address to the trace stack on thread info.*/ @@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +static int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return 0; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); + + return 1; +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (unlikely(!tr)) + return 0; + + if (!ftrace_trace_task(current)) + return 0; + + if (!ftrace_graph_addr(trace->func)) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(tr->buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = 
atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static int graph_trace_init(struct trace_array *tr) { - int ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + int ret; + + graph_array = tr; + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -177,6 +288,11 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd6485bdd7..d2cdbabb4ead 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(tr); + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); if (ret) { -- cgit v1.2.3 From a2ca5e03b6a5a1d401062f0a7f78888cf9e5e3b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 Aug 2009 07:32:21 +0200 Subject: tracing/events: Only define remove_subsystem_dir() if CONFIG_MODULES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we disable modules, we get the following warning in ftrace events file: kernel/trace/trace_events.c:912: attention : ‘remove_subsystem_dir’ defined but not used remove_subystem_dir() is useless if !CONFIG_MODULES, then move it to the appropriate #ifdef section of trace_events.c Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- kernel/trace/trace_events.c | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 90cf9360e140..70ecb7653b46 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -908,32 +908,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return system->entry; } -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - struct event_filter *filter = system->filter; - - debugfs_remove_recursive(system->entry); - list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); - } - break; - } - } -} - static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *id, @@ -1018,6 +992,32 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + 
kfree(system); + } + break; + } + } +} + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { -- cgit v1.2.3 From f3d1915a8623b9248572d3ee44e19a80b7a3520b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 6 Aug 2009 00:09:31 +0400 Subject: x86, ioapic: Panic on irq-pin binding only if needed Though most of the time we want to panic when irq-pin allocation fails, for PCI interrupts it's not the case: we can continue operating even if irq-pin allocation failed. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090805200931.GB5319@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2a145d3a8375..2999f3dd5889 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -490,7 +490,8 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -498,19 +499,27 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin last = &cfg->irq_2_pin; for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; + return 0; last = &entry->next; } entry = get_one_free_irq_2_pin(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list\n"); - BUG_ON(1); + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; } entry->apic = apic; entry->pin = pin; *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* @@ -3843,7 +3852,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); -- cgit v1.2.3 From 00235fe25eba6d3a13f3349b2e3a2d94b699a414 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 5 Aug 2009 15:02:20 -0700 Subject: futex: Update woken requeued futex_q lock_ptr futex_requeue() can acquire the lock on behalf of a waiter during the requeue loop in the event of a lock steal or owner death. futex_wait_requeue_pi() cleans up the pi_state owner, using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is found on the requeue target futex hash bucket, so the lock_ptr needs to be updated accordingly. The problem manifested itself by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner pid. The astute reviewer will note that a race still exists between the time futex_requeue() releases hb2->lock() and the time when futex_wait_requeue_pi() acquires it. During this time the pi_state and the futex uaddr are not in sync with the rt_mutex ownership. This patch closes the window to the point where my tests now pass, but we still need to address it.
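To make the fix concrete, here is a minimal user-space model of the invariant the patch restores. This is an illustrative sketch only: the waiter and bucket structures below are simplified stand-ins for the kernel's futex_q and futex_hash_bucket, not the real definitions. A queued waiter caches a pointer to the lock of the hash bucket it currently sits on, so requeueing it to another bucket must retarget that cached pointer, otherwise later users of lock_ptr serialize on the wrong bucket's lock:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct waiter {
	pthread_mutex_t *lock_ptr;	/* lock of the bucket we are queued on */
};

struct bucket {
	pthread_mutex_t lock;
	struct waiter *head;		/* one waiter is enough for the demo */
};

/* Move @w from @from to @to, retargeting the cached lock (the fix). */
static void requeue(struct waiter *w, struct bucket *from, struct bucket *to)
{
	from->head = NULL;
	to->head = w;
	w->lock_ptr = &to->lock;
}

int main(void)
{
	struct bucket b1 = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct bucket b2 = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct waiter w = { &b1.lock };

	b1.head = &w;
	requeue(&w, &b1, &b2);

	/* A later fixup path locks via the cached pointer ... */
	pthread_mutex_lock(w.lock_ptr);
	/* ... which must be the lock of the bucket the waiter is now on. */
	assert(w.lock_ptr == &b2.lock);
	pthread_mutex_unlock(w.lock_ptr);

	printf("lock_ptr follows the waiter across buckets\n");
	return 0;
}

In the kernel, the retargeting is the q->lock_ptr = &hb->lock assignment added to requeue_pi_wake_futex() below.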
Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: John Stultz Cc: John Kacur Cc: LKML-Reference: <4A7A016C.1090002@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f159..57f5a80eef41 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,19 +1010,24 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with the q->lock_ptr held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; + q->lock_ptr = &hb->lock; WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -1088,7 +1093,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1278,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. */ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3 From 1e5de18278e6862f4198412b5059a03770fa816a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:12:20 +0900 Subject: x86: Introduce GDT_ENTRY_INIT() GDT_ENTRY_INIT is static initializer of desc_struct. We already have similar macro GDT_ENTRY() but it's static initializer for u64 and it cannot be used for desc_struct. 
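As a sanity check on the bit layout, the following user-space sketch (an illustration, not part of the patch) repacks a few of the converted entries with the same shifts and masks GDT_ENTRY_INIT() uses and compares the results against the raw words of the old-style initializers replaced further down:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same field packing as the GDT_ENTRY_INIT() macro. */
static void gdt_entry_init(uint32_t flags, uint32_t base, uint32_t limit,
			   uint32_t *a, uint32_t *b)
{
	*a = (limit & 0xffff) | ((base & 0xffff) << 16);
	*b = ((base & 0xff0000) >> 16) | ((flags & 0xf0ff) << 8) |
	     (limit & 0xf0000) | (base & 0xff000000);
}

int main(void)
{
	uint32_t a, b;

	/* 32-bit kernel code segment, was { 0x0000ffff, 0x00cf9b00 }. */
	gdt_entry_init(0xc09b, 0, 0xfffff, &a, &b);
	assert(a == 0x0000ffff && b == 0x00cf9b00);

	/* 64-bit kernel code segment, was { 0x0000ffff, 0x00af9b00 }. */
	gdt_entry_init(0xa09b, 0, 0xfffff, &a, &b);
	assert(a == 0x0000ffff && b == 0x00af9b00);

	/* APM bad_bios_desc, was { 0, 0x00409200 }. */
	gdt_entry_init(0x4092, 0, 0, &a, &b);
	assert(a == 0 && b == 0x00409200);

	printf("macro packing matches the old raw initializers\n");
	return 0;
}

The low byte of the flags argument is the access byte (type, S, DPL, P) and bits 12-15 are the granularity nibble (AVL, L, D/B, G), which is why 0xc09b unpacks into the familiar 0x00cf9b00 descriptor word.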
Signed-off-by: Akinobu Mita LKML-Reference: <20090718151219.GD11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc_defs.h | 6 ++++++ arch/x86/include/asm/lguest.h | 5 +++-- arch/x86/include/asm/stackprotector.h | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/common.c | 40 +++++++++++++++++------------------ drivers/pnp/pnpbios/bioscalls.c | 5 +---- 6 files changed, 32 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a6adefa28b94..9d6684849fd9 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -34,6 +34,12 @@ struct desc_struct { }; } __attribute__((packed)); +#define GDT_ENTRY_INIT(flags, base, limit) { { { \ + .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ + .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ + ((limit) & 0xf0000) | ((base) & 0xff000000), \ + } } } + enum { GATE_INTERRUPT = 0xE, GATE_TRAP = 0xF, diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d2..94cd69858b14 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -91,8 +91,9 @@ static inline void lguest_set_ts(void) } /* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) -#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) +#define FULL_EXEC_SEGMENT \ + ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) +#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index cdc5e0b126a7..44efdff3975d 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -48,7 +48,7 @@ * head_32 for boot CPU and setup_per_cpu_areas() for others. */ #define GDT_STACK_CANARY_INIT \ - [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, + [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), /* * Initialize the stackprotector canary value. diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5e841bd60d9..febb2dab254f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); static const char driver_version[] = "1.16ac"; /* no spaces */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..8c9bc287f8fb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -71,45 +71,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) 
*/ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. 
*/ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 45ad3e9cc362..bd035e3d3550 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -60,7 +60,7 @@ do { \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) -static struct desc_struct bad_bios_desc; +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); /* * At some point we want to use this stack frame pointer to unwind @@ -476,9 +476,6 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.offset = header->fields.pm16offset; pnp_bios_callpoint.segment = PNP_CS16; - bad_bios_desc.a = 0; - bad_bios_desc.b = 0x00409200; - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { -- cgit v1.2.3 From 72c4d8530244264317a662de9a55cc47e6c8e9df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Aug 2009 08:47:07 +0200 Subject: x86: Introduce GDT_ENTRY_INIT(), fix APM This crash: [ 0.891983] calling cache_sysfs_init+0x0/0x1ee @ 1 [ 0.897251] initcall cache_sysfs_init+0x0/0x1ee returned 0 after 405 usecs [ 0.904019] calling mce_init_device+0x0/0x242 @ 1 [ 0.909124] initcall mce_init_device+0x0/0x242 returned 0 after 347 usecs [ 0.915815] calling apm_init+0x0/0x38d @ 1 [ 0.919967] apm: BIOS version 1.2 Flags 0x07 (Driver version 1.16ac) [ 0.926813] general protection fault: 0000 [#1] [ 0.927269] last sysfs file: [ 0.927269] Modules linked in: [ 0.927269] [ 0.927269] Pid: 271, comm: kapmd Not tainted (2.6.31-rc3-00100-gd520da1-dirty #311) System Product Name [ 0.927269] EIP: 00c0:[<000082b2>] EFLAGS: 00010002 CPU: 0 [ 0.927269] EIP is at 0x82b2 [ 0.927269] EAX: 0000530e EBX: 00000000 ECX: 00000102 EDX: 00000000 [ 0.927269] ESI: 00000000 EDI: f6a4bf44 EBP: 67890000 ESP: f6a4beec [ 0.927269] DS: 00c8 ES: 0000 FS: 0000 GS: 0000 SS: 0068 [ 0.927269] Process kapmd (pid: 271, ti=f6a4a000 task=f7142280 task.ti=f6a4a000) [ 0.927269] Stack: [ 0.927269] 0000828d 02160000 00b88092 f6a4bf3c c102a63d 00000060 f6a4bf3c f6a4bf44 [ 0.927269] <0> 0000007b 0000007b 00000000 00000000 00000000 00000000 560aae9e 00000000 [ 0.927269] <0> 00000200 f705fd74 00000000 c102af70 f6a4bf60 c102a6ec 0000530e 00000000 [ 0.927269] Call Trace: [ 0.927269] [] ? __apm_bios_call_simple+0x7d/0x110 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? apm_bios_call_simple+0x1c/0x50 [ 0.927269] [] ? apm+0x485/0x6a0 [ 0.927269] [] ? finish_task_switch+0x2a/0xb0 [ 0.927269] [] ? schedule+0x31e/0x480 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? kthread+0x74/0x80 [ 0.927269] [] ? kthread+0x0/0x80 [ 0.927269] [] ? kernel_thread_helper+0x7/0x10 [ 0.927269] Code: Bad EIP value. 
[ 0.927269] EIP: [<000082b2>] 0x82b2 SS:ESP 0068:f6a4beec [ 0.927269] ---[ end trace a7919e7f17c0a725 ]--- [ 0.927269] Kernel panic - not syncing: Fatal exception [ 0.927269] Pid: 271, comm: kapmd Tainted: G D 2.6.31-rc3-00100-gd520da1-dirty #311 Is caused by an incorrect GDT_ENTRY_INIT() conversion in the apm code, as noticed by hpa. Reported-by: Ingo Molnar Noticed-by: "H. Peter Anvin" Signed-off-by: Akinobu Mita LKML-Reference: <20090808094905.GA2954@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8c9bc287f8fb..b5764a269645 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -106,7 +106,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* 16-bit code */ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), -- cgit v1.2.3 From fb82ad719831db58e9baa4c67015aae3fe27e7e3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 8 Aug 2009 10:49:36 -0500 Subject: tracing/filters: Don't use pred on alloc failure Dan Carpenter sent me a fix to prevent pred from being used if it couldn't be allocated. This updates his patch for the same problem in the tracing tree (which has changed this code quite substantially). Reported-by: Dan Carpenter Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Li Zefan LKML-Reference: <1249746576.6453.30.camel@tropicana> Signed-off-by: Ingo Molnar The original report: create_logical_pred() could sometimes return NULL. It's a static checker complaining rather than problems at runtime... --- kernel/trace/trace_events_filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27c2dbea3710..490337abed75 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1050,6 +1050,8 @@ static int replace_preds(struct event_subsystem *system, pred = create_pred(elt->op, operand1, operand2); add_pred: + if (!pred) + return -ENOMEM; if (call) err = filter_add_pred(ps, call, pred, false); else -- cgit v1.2.3 From 4c711576b90cc36c13b94816a953a8de6a53d03c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 6 Aug 2009 15:58:12 -0700 Subject: x86, 32-bit: Use generic sys_pipe() As suggested by Al, it's better to use the generic sys_pipe() for ia32. 
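The user-visible contract is unchanged: pipe() fills a two-element int array in user memory and returns 0 on success, which is what makes the generic entry point a drop-in replacement for the removed wrapper. A quick user-space check of that contract (illustrative only, not part of the patch):

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fds[2] = { -1, -1 };
	char buf[6] = "";

	assert(pipe(fds) == 0);
	assert(fds[0] >= 0 && fds[1] >= 0);	/* both slots filled in */

	/* Data written to the write end comes back out of the read end. */
	assert(write(fds[1], "hello", 6) == 6);
	assert(read(fds[0], buf, sizeof(buf)) == 6);
	assert(strcmp(buf, "hello") == 0);

	printf("pipe() ABI: read fd %d, write fd %d\n", fds[0], fds[1]);
	return 0;
}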
Signed-off-by: WANG Cong Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e590261ba059..ba331bfd1112 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -537,7 +537,7 @@ ia32_sys_call_table: .quad sys_mkdir .quad sys_rmdir /* 40 */ .quad sys_dup - .quad sys32_pipe + .quad sys_pipe .quad compat_sys_times .quad quiet_ni_syscall /* old prof syscall holder */ .quad sys_brk /* 45 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 085a8c35f149..9f5527198825 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, return sys_mprotect(start, len, prot); } -asmlinkage long sys32_pipe(int __user *fd) -{ - int retval; - int fds[2]; - - retval = do_pipe_flags(fds, 0); - if (retval) - goto out; - if (copy_to_user(fd, fds, sizeof(fds))) - retval = -EFAULT; -out: - return retval; -} - asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, struct sigaction32 __user *oact, unsigned int sigsetsize) -- cgit v1.2.3 From 42c2c8c854a716b05882a122632ddcd6dbe108f1 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 6 Aug 2009 15:58:11 -0700 Subject: printk: Fix "printk: Enable the use of more than one CON_BOOT (early console)" Don't return when we find the first bootconsole - it can leave other bootconsoles still installed, and they can be used and cause problems later (if they are in the init section, and eventually released). Make sure we remove all of them. Signed-off-by: Sonic Zhang Signed-off-by: Robin Getz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk.c b/kernel/printk.c index e0daaf57985a..e10d193a833a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1352,7 +1352,7 @@ static int __init disable_boot_consoles(void) if (con->flags & CON_BOOT) { printk(KERN_INFO "turn off boot console %s%d\n", con->name, con->index); - return unregister_console(con); + unregister_console(con); } } return 0; -- cgit v1.2.3 From c36ba80ea01d0aecb652c26799a912e760ce8981 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 6 Aug 2009 21:46:03 +0200 Subject: irq: Remove superfluous NULL pointer check in check_irq_resend() This takes care of the following entry from Dan's list: kernel/irq/resend.c +73 check_irq_resend(17) warning: variable derefenced before check 'desc->chip' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Jonathan Corbet Cc: Eugene Teo Cc: Julia Lawall LKML-Reference: <200908062146.03638.bzolnier@gmail.com> Signed-off-by: Ingo Molnar --- kernel/irq/resend.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 89c7117acf2b..090c3763f3a2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip || !desc->chip->retrigger || - !desc->chip->retrigger(irq)) { + if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate
the softirq: */ set_bit(irq, irqs_resend); -- cgit v1.2.3 From 30dd568c912602b7dbd609a45d053e01b13422bb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 21 Jul 2009 15:56:48 +0200 Subject: x86, perf_counter, bts: Add BTS support to perfcounters Implement a performance counter with: attr.type = PERF_TYPE_HARDWARE attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS attr.sample_period = 1 Using branch trace store (BTS) on x86 hardware, if available. The from and to address for each branch can be sampled using: PERF_SAMPLE_IP for the from address PERF_SAMPLE_ADDR for the to address [ v2: address review feedback, fix bugs ] Signed-off-by: Markus Metzger Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 10 ++ arch/x86/kernel/cpu/perf_counter.c | 325 +++++++++++++++++++++++++++++++++++- include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 10 +- 4 files changed, 340 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index fa64e401589d..e7b7c938ae27 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,16 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed counter range, since lower + * values are used by actual fixed counters and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f900954..b237c181aa41 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, * * For licencing details see kernel-base/COPYING */ @@ -20,6 +21,7 @@ #include #include #include +#include #include #include @@ -27,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. 
+ */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -57,6 +99,8 @@ struct x86_pmu { u64 counter_mask; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -576,6 +620,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -659,10 +706,109 @@ static void release_pmc_hardware(void) enable_lapic_nmi_watchdog(); } +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return -EOPNOTSUPP; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -705,6 +851,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= 
X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ @@ -721,9 +903,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -801,7 +987,18 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -859,7 +1056,25 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -946,6 +1161,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -974,6 +1194,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -1056,6 +1279,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -1077,11 +1308,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + if (!x86_pmu.num_counters_fixed) return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(event 
== x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1102,7 +1338,25 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* + * Try to use BTS for branch tracing. If that is not + * available, try to get a generic counter. + */ + if (unlikely(!cpuc->ds)) + goto try_generic; + + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1213,6 +1467,45 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + u64 at; + + if (!counter) + return; + + if (!ds) + return; + + for (at = ds->bts_buffer_base; + at < ds->bts_index; + at += sizeof(struct bts_record)) { + struct bts_record *rec = (struct bts_record *)(long)at; + + data->regs->ip = rec->from; + data->addr = rec->to; + + perf_counter_output(counter, 1, data); + } + + ds->bts_index = ds->bts_buffer_base; + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1237,6 +1530,15 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. 
*/ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = &regs; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1264,6 +1566,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1281,6 +1584,8 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1346,6 +1651,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1547,6 +1853,8 @@ static struct x86_pmu intel_pmu = { * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1936,3 +2244,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d04..0a6f3209c9de 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -692,6 +692,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d62941..bf8110b35c51 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4566,6 +4567,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4590,6 +4596,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3 From 8d51327090ac025d7f4ce6c059786b5e93513321 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Fri, 7 Aug 2009 13:55:24 +0200 Subject: perf report: Fix and improve the displaying of per-thread event counters Improve and fix the handling of per-thread counter stats recorded via perf
record -s. Previously we only displayed it in debug printouts (-D) and even that output was hard to disambiguate. I moved everything to utils/values.[ch] so that we may reuse it in perf stat. We get something like this now: # PID TID cache-misses cache-references 4658 4659 495581 3238779 4658 4662 498246 3236823 4658 4663 499531 3243162 Then it'll be easy to add --pretty=raw to display a single line per thread/event. By the way, -S was also used for --symbol... So I used -T/--thread here. perf report: Add -T/--threads to display per-thread counter values We get something like this now: # PID TID cache-misses cache-references 4658 4659 495581 3238779 4658 4662 498246 3236823 4658 4663 499531 3243162 Per-thread arrays of counter values are managed in utils/values.[ch] Signed-off-by: Brice Goglin Cc: Peter Zijlstra Cc: paulus@samba.org Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-report.txt | 3 + tools/perf/Makefile | 2 + tools/perf/builtin-report.c | 25 +++++ tools/perf/util/values.c | 171 +++++++++++++++++++++++++++++++ tools/perf/util/values.h | 26 +++++ 5 files changed, 227 insertions(+) create mode 100644 tools/perf/util/values.c create mode 100644 tools/perf/util/values.h diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index e72e93110782..370344afb5b2 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -27,6 +27,9 @@ OPTIONS -n --show-nr-samples Show the number of samples for each symbol +-T +--threads + Show per-thread event counters -C:: --comms=:: Only consider symbols in these comms. CSV that understands diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 60411e94113b..de7beac1095e 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -310,6 +310,7 @@ LIB_H += util/sigchain.h LIB_H += util/symbol.h LIB_H += util/module.h LIB_H += util/color.h +LIB_H += util/values.h LIB_OBJS += util/abspath.o LIB_OBJS += util/alias.o @@ -337,6 +338,7 @@ LIB_OBJS += util/color.o LIB_OBJS += util/pager.o LIB_OBJS += util/header.o LIB_OBJS += util/callchain.o +LIB_OBJS += util/values.o BUILTIN_OBJS += builtin-annotate.o BUILTIN_OBJS += builtin-help.o diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 99274cec0adb..41639182fb3f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -17,6 +17,7 @@ #include "util/string.h" #include "util/callchain.h" #include "util/strlist.h" +#include "util/values.h" #include "perf.h" #include "util/header.h" @@ -53,6 +54,9 @@ static int modules; static int full_paths; static int show_nr_samples; +static int show_threads; +static struct perf_read_values show_threads_values; + static unsigned long page_size; static unsigned long mmap_window = 32; @@ -1473,6 +1477,9 @@ print_entries: free(rem_sq_bracket); + if (show_threads) + perf_read_values_display(fp, &show_threads_values); + return ret; } @@ -1758,6 +1765,16 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head) { struct perf_counter_attr *attr = perf_header__find_attr(event->read.id); + if (show_threads) { + char *name = attr ? 
__event_name(attr->type, attr->config) + : "unknown"; + perf_read_values_add_value(&show_threads_values, + event->read.pid, event->read.tid, + event->read.id, + name, + event->read.value); + } + dprintf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), @@ -1839,6 +1856,9 @@ static int __cmd_report(void) register_idle_thread(); + if (show_threads) + perf_read_values_init(&show_threads_values); + input = open(input_name, O_RDONLY); if (input < 0) { fprintf(stderr, " failed to open file: %s", input_name); @@ -1993,6 +2013,9 @@ done: output__resort(total); output__fprintf(stdout, total); + if (show_threads) + perf_read_values_destroy(&show_threads_values); + return rc; } @@ -2066,6 +2089,8 @@ static const struct option options[] = { "load module symbols - WARNING: use only with -k and LIVE kernel"), OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples, "Show a column with the number of samples"), + OPT_BOOLEAN('T', "threads", &show_threads, + "Show per-thread event counters"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('P', "full-paths", &full_paths, diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c new file mode 100644 index 000000000000..8551c0b8b233 --- /dev/null +++ b/tools/perf/util/values.c @@ -0,0 +1,171 @@ +#include + +#include "util.h" +#include "values.h" + +void perf_read_values_init(struct perf_read_values *values) +{ + values->threads_max = 16; + values->pid = malloc(values->threads_max * sizeof(*values->pid)); + values->tid = malloc(values->threads_max * sizeof(*values->tid)); + values->value = malloc(values->threads_max * sizeof(*values->value)); + if (!values->pid || !values->tid || !values->value) + die("failed to allocate read_values threads arrays"); + values->threads = 0; + + values->counters_max = 16; + values->counterrawid = malloc(values->counters_max + * sizeof(*values->counterrawid)); + values->countername = malloc(values->counters_max + * sizeof(*values->countername)); + if (!values->counterrawid || !values->countername) + die("failed to allocate read_values counters arrays"); + values->counters = 0; +} + +void perf_read_values_destroy(struct perf_read_values *values) +{ + int i; + + if (!values->threads_max || !values->counters_max) + return; + + for (i = 0; i < values->threads; i++) + free(values->value[i]); + free(values->pid); + free(values->tid); + free(values->counterrawid); + for (i = 0; i < values->counters; i++) + free(values->countername[i]); + free(values->countername); +} + +static void perf_read_values__enlarge_threads(struct perf_read_values *values) +{ + values->threads_max *= 2; + values->pid = realloc(values->pid, + values->threads_max * sizeof(*values->pid)); + values->tid = realloc(values->tid, + values->threads_max * sizeof(*values->tid)); + values->value = realloc(values->value, + values->threads_max * sizeof(*values->value)); + if (!values->pid || !values->tid || !values->value) + die("failed to enlarge read_values threads arrays"); +} + +static int perf_read_values__findnew_thread(struct perf_read_values *values, + u32 pid, u32 tid) +{ + int i; + + for (i = 0; i < values->threads; i++) + if (values->pid[i] == pid && values->tid[i] == tid) + return i; + + if (values->threads == values->threads_max) + perf_read_values__enlarge_threads(values); + + i = values->threads++; + values->pid[i] = pid; + values->tid[i] = tid; + values->value[i] = malloc(values->counters_max * sizeof(**values->value)); + if 
(!values->value[i]) + die("failed to allocate read_values counters array"); + + return i; +} + +static void perf_read_values__enlarge_counters(struct perf_read_values *values) +{ + int i; + + values->counters_max *= 2; + values->counterrawid = realloc(values->counterrawid, + values->counters_max * sizeof(*values->counterrawid)); + values->countername = realloc(values->countername, + values->counters_max * sizeof(*values->countername)); + if (!values->counterrawid || !values->countername) + die("failed to enlarge read_values counters arrays"); + + for (i = 0; i < values->threads; i++) { + values->value[i] = realloc(values->value[i], + values->counters_max * sizeof(**values->value)); + if (!values->value[i]) + die("failed to enlarge read_values counters arrays"); + } +} + +static int perf_read_values__findnew_counter(struct perf_read_values *values, + u64 rawid, char *name) +{ + int i; + + for (i = 0; i < values->counters; i++) + if (values->counterrawid[i] == rawid) + return i; + + if (values->counters == values->counters_max) + perf_read_values__enlarge_counters(values); + + i = values->counters++; + values->counterrawid[i] = rawid; + values->countername[i] = strdup(name); + + return i; +} + +void perf_read_values_add_value(struct perf_read_values *values, + u32 pid, u32 tid, + u64 rawid, char *name, u64 value) +{ + int tindex, cindex; + + tindex = perf_read_values__findnew_thread(values, pid, tid); + cindex = perf_read_values__findnew_counter(values, rawid, name); + + values->value[tindex][cindex] = value; +} + +void perf_read_values_display(FILE *fp, struct perf_read_values *values) +{ + int i, j; + int pidwidth, tidwidth; + int *counterwidth; + + counterwidth = malloc(values->counters * sizeof(*counterwidth)); + if (!counterwidth) + die("failed to allocate counterwidth array"); + tidwidth = 3; + pidwidth = 3; + for (j = 0; j < values->counters; j++) + counterwidth[j] = strlen(values->countername[j]); + for (i = 0; i < values->threads; i++) { + int width; + + width = snprintf(NULL, 0, "%d", values->pid[i]); + if (width > pidwidth) + pidwidth = width; + width = snprintf(NULL, 0, "%d", values->tid[i]); + if (width > tidwidth) + tidwidth = width; + for (j = 0; j < values->counters; j++) { + width = snprintf(NULL, 0, "%Lu", values->value[i][j]); + if (width > counterwidth[j]) + counterwidth[j] = width; + } + } + + fprintf(fp, "# %*s %*s", pidwidth, "PID", tidwidth, "TID"); + for (j = 0; j < values->counters; j++) + fprintf(fp, " %*s", counterwidth[j], values->countername[j]); + fprintf(fp, "\n"); + + for (i = 0; i < values->threads; i++) { + fprintf(fp, " %*d %*d", pidwidth, values->pid[i], + tidwidth, values->tid[i]); + for (j = 0; j < values->counters; j++) + fprintf(fp, " %*Lu", + counterwidth[j], values->value[i][j]); + fprintf(fp, "\n"); + } +} diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h new file mode 100644 index 000000000000..e41be5e86e6b --- /dev/null +++ b/tools/perf/util/values.h @@ -0,0 +1,26 @@ +#ifndef _PERF_VALUES_H +#define _PERF_VALUES_H + +#include "types.h" + +struct perf_read_values { + int threads; + int threads_max; + u32 *pid, *tid; + int counters; + int counters_max; + u64 *counterrawid; + char **countername; + u64 **value; +}; + +void perf_read_values_init(struct perf_read_values *values); +void perf_read_values_destroy(struct perf_read_values *values); + +void perf_read_values_add_value(struct perf_read_values *values, + u32 pid, u32 tid, + u64 rawid, char *name, u64 value); + +void perf_read_values_display(FILE *fp, struct perf_read_values 
*values); + +#endif /* _PERF_VALUES_H */ -- cgit v1.2.3 From 8e5b59a2d728e6963b35dba8bb36e0b70267462e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2009 16:02:50 -0700 Subject: sched: Add default defines for PREEMPT_ACTIVE The PREEMPT_ACTIVE setting doesn't actually need to be arch-specific, so set up a sane default for all arches to (hopefully) migrate to. > if we look at linux/hardirq.h, it makes this claim: > * - bit 28 is the PREEMPT_ACTIVE flag > if that's true, then why are we letting any arch set this define ? a > quick survey shows that half the arches (11) are using 0x10000000 (bit > 28) while the other half (10) are using 0x4000000 (bit 26). and then > there is the ia64 oddity which uses bit 30. the exact value here > shouldnt really matter across arches though should it ? actually alpha, arm and avr32 also use bit 30 (0x40000000), there are only five (or eight, depending on how you count) architectures (blackfin, h8300, m68k, s390 and sparc) using bit 26. Signed-off-by: Arnd Bergmann Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8246c697863d..0d885fd75111 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,12 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#ifndef PREEMPT_ACTIVE +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +#endif + #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -- cgit v1.2.3 From 9f51e24ee8b5a1595b6a5ac0c2be278a16488e75 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 9 Aug 2009 21:54:00 +0200 Subject: x86: Use printk_once() Signed-off-by: Marcin Slusarz Cc: "H. 
Peter Anvin" LKML-Reference: <1249847649-11631-6-git-send-email-marcin.slusarz@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 5 ++--- arch/x86/kvm/x86.c | 7 +------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b09634a5153..7d35d0fe2329 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) void fixup_irqs(void) { unsigned int irq; - static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { @@ -236,8 +235,8 @@ void fixup_irqs(void) } if (desc->chip->set_affinity) desc->chip->set_affinity(irq, affinity); - else if (desc->action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); + else if (desc->action) + printk_once("Cannot set affinity for irq %i\n", irq); } #if 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe5474aec41a..0572c90f0c84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2261,12 +2261,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - static int reported; - - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange as write\n"); - } + printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); #ifndef CONFIG_X86_64 /* guests cmpxchg8b have to be emulated atomically */ if (bytes == 8) { -- cgit v1.2.3 From a8ad568dd8ca122aa8048ea067d3599820d1c1b4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Aug 2009 11:53:10 +0900 Subject: dma-ops: Remove flush_write_buffers() in dma-mapping-common.h This moves flush_write_buffers() in asm-generic/dma-mapping-common.h to arch/x86/kernel/pci-nommu.c. The purpose of this patch is that, we can avoid defining NULL flush_write_buffers() on IA64 and SPARC. dma-mapping-common.h is used by X86 and IA64 (and SPARC soon) but only X86 with CONFIG_X86_OOSTORE or CONFIG_X86_PPRO_FENCE actually uses flush_write_buffers(). CONFIG_X86_OOSTORE or CONFIG_X86_PPRO_FENCE is usable with only kernel/pci-nommu.c (that is, not usable with other X86 IOMMU implementations such as SWIOTLB, VT-d, etc) so we can safely move flush_write_buffers() in asm-generic/dma-mapping-common.h to arch/x86/kernel/pci-nommu.c. 
The further discussion is: http://lkml.org/lkml/2009/6/28/104 Signed-off-by: Arnd Bergmann Acked-by: FUJITA Tomonori Cc: davem@davemloft.net Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 27 ++++++++++++++++++++++----- include/asm-generic/dma-mapping-common.h | 6 ------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index c0a4222bf62b..a3933d4330cd 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, }; void __init no_iommu_init(void) diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index 5406a601185c..e694263445f7 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -103,7 +103,6 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); - flush_write_buffers(); } static inline void dma_sync_single_for_device(struct device *dev, @@ -116,7 +115,6 @@ static inline void dma_sync_single_for_device(struct device *dev, if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); - flush_write_buffers(); } static inline void dma_sync_single_range_for_cpu(struct device *dev, @@ -132,7 +130,6 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev, ops->sync_single_range_for_cpu(dev, addr, offset, size, dir); debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); - flush_write_buffers(); } else dma_sync_single_for_cpu(dev, addr, size, dir); } @@ -150,7 +147,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, ops->sync_single_range_for_device(dev, addr, offset, size, dir); debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); - flush_write_buffers(); } else dma_sync_single_for_device(dev, addr, size, dir); } @@ -165,7 +161,6 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); - flush_write_buffers(); } static inline void @@ -179,7 +174,6 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, ops->sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - 
flush_write_buffers(); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) -- cgit v1.2.3 From be02ff9940c0106dea1470462401a07c5d52e086 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:11 +0900 Subject: IA64: Remove NULL flush_write_buffers flush_write_buffers() in dma-mapping-common.h was removed so we can remove NULL flush_write_buffers() in IA64. Signed-off-by: FUJITA Tomonori Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: davem@davemloft.net LKML-Reference: <1249872797-1314-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index f91829de329f..8d3c79cd81e7 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -44,7 +44,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size, #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #define get_dma_ops(dev) platform_dma_get_ops(dev) -#define flush_write_buffers() #include -- cgit v1.2.3 From bc0a14f154069cc4e42ea903c2c2b9984a94e4b7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:12 +0900 Subject: sparc: Use dma_map_ops struct Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/dma-mapping.h | 43 ++++++------------------------------ arch/sparc/kernel/dma.c | 16 +++++++++----- arch/sparc/kernel/iommu.c | 16 +++++++++----- arch/sparc/kernel/pci_sun4v.c | 14 +++++++----- 4 files changed, 36 insertions(+), 53 deletions(-) diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 204e4bf64438..893f3ecc9750 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -13,36 +13,7 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask); #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #define dma_is_consistent(d, h) (1) -struct dma_ops { - void *(*alloc_coherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); - void (*free_coherent)(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction); - void (*unmap_page)(struct device *dev, dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction); - int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction); - void (*unmap_sg)(struct device *dev, struct scatterlist *sg, - int nhwentries, - enum dma_data_direction direction); - void (*sync_single_for_cpu)(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - void (*sync_single_for_device)(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg, - int nelems, - enum dma_data_direction direction); - void (*sync_sg_for_device)(struct device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction dir); -}; -extern const struct dma_ops *dma_ops; +extern const struct dma_map_ops *dma_ops; static inline void 
*dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) @@ -62,40 +33,40 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, { return dma_ops->map_page(dev, virt_to_page(cpu_addr), (unsigned long)cpu_addr & ~PAGE_MASK, size, - direction); + direction, NULL); } static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction) { - dma_ops->unmap_page(dev, dma_addr, size, direction); + dma_ops->unmap_page(dev, dma_addr, size, direction, NULL); } static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { - return dma_ops->map_page(dev, page, offset, size, direction); + return dma_ops->map_page(dev, page, offset, size, direction, NULL); } static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, enum dma_data_direction direction) { - dma_ops->unmap_page(dev, dma_address, size, direction); + dma_ops->unmap_page(dev, dma_address, size, direction, NULL); } static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { - return dma_ops->map_sg(dev, sg, nents, direction); + return dma_ops->map_sg(dev, sg, nents, direction, NULL); } static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { - dma_ops->unmap_sg(dev, sg, nents, direction); + dma_ops->unmap_sg(dev, sg, nents, direction, NULL); } static inline void dma_sync_single_for_cpu(struct device *dev, diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c index 524c32f97c55..473a3fc7ab5b 100644 --- a/arch/sparc/kernel/dma.c +++ b/arch/sparc/kernel/dma.c @@ -60,7 +60,8 @@ static void dma32_free_coherent(struct device *dev, size_t size, static dma_addr_t dma32_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, - enum dma_data_direction direction) + enum dma_data_direction direction, + struct dma_attrs *attrs) { #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) @@ -72,7 +73,8 @@ static dma_addr_t dma32_map_page(struct device *dev, struct page *page, } static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, enum dma_data_direction direction) + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) { #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) { @@ -85,7 +87,8 @@ static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address, } static int dma32_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) @@ -95,7 +98,8 @@ static int dma32_map_sg(struct device *dev, struct scatterlist *sg, } void dma32_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) { @@ -161,7 +165,7 @@ static void dma32_sync_sg_for_device(struct device *dev, BUG(); } -static const struct dma_ops dma32_dma_ops = { +static const struct dma_map_ops dma32_dma_ops = { .alloc_coherent = dma32_alloc_coherent, .free_coherent = dma32_free_coherent, .map_page = dma32_map_page, @@ -174,5 +178,5 @@ static const struct dma_ops dma32_dma_ops = { .sync_sg_for_device = dma32_sync_sg_for_device, }; 
-const struct dma_ops *dma_ops = &dma32_dma_ops; +const struct dma_map_ops *dma_ops = &dma32_dma_ops; EXPORT_SYMBOL(dma_ops); diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 0aeaefe696b9..a9f0ad955186 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -353,7 +353,8 @@ static void dma_4u_free_coherent(struct device *dev, size_t size, static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page, unsigned long offset, size_t sz, - enum dma_data_direction direction) + enum dma_data_direction direction, + struct dma_attrs *attrs) { struct iommu *iommu; struct strbuf *strbuf; @@ -474,7 +475,8 @@ do_flush_sync: } static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, - size_t sz, enum dma_data_direction direction) + size_t sz, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct iommu *iommu; struct strbuf *strbuf; @@ -520,7 +522,8 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, } static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct scatterlist *s, *outs, *segstart; unsigned long flags, handle, prot, ctx; @@ -691,7 +694,8 @@ static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg) } static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) { unsigned long flags, ctx; struct scatterlist *sg; @@ -822,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev, spin_unlock_irqrestore(&iommu->lock, flags); } -static const struct dma_ops sun4u_dma_ops = { +static const struct dma_map_ops sun4u_dma_ops = { .alloc_coherent = dma_4u_alloc_coherent, .free_coherent = dma_4u_free_coherent, .map_page = dma_4u_map_page, @@ -833,7 +837,7 @@ static const struct dma_ops sun4u_dma_ops = { .sync_sg_for_cpu = dma_4u_sync_sg_for_cpu, }; -const struct dma_ops *dma_ops = &sun4u_dma_ops; +const struct dma_map_ops *dma_ops = &sun4u_dma_ops; EXPORT_SYMBOL(dma_ops); int dma_supported(struct device *dev, u64 device_mask) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 2485eaa23101..c4f7dce577dd 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -232,7 +232,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu, static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, unsigned long offset, size_t sz, - enum dma_data_direction direction) + enum dma_data_direction direction, + struct dma_attrs *attrs) { struct iommu *iommu; unsigned long flags, npages, oaddr; @@ -296,7 +297,8 @@ iommu_map_fail: } static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr, - size_t sz, enum dma_data_direction direction) + size_t sz, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct pci_pbm_info *pbm; struct iommu *iommu; @@ -336,7 +338,8 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr, } static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct scatterlist *s, *outs, *segstart; unsigned long flags, handle, prot; @@ -478,7 +481,8 @@ iommu_map_failed: } static void dma_4v_unmap_sg(struct device *dev, 
struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct pci_pbm_info *pbm; struct scatterlist *sg; @@ -535,7 +539,7 @@ static void dma_4v_sync_sg_for_cpu(struct device *dev, /* Nothing to do... */ } -static const struct dma_ops sun4v_dma_ops = { +static const struct dma_map_ops sun4v_dma_ops = { .alloc_coherent = dma_4v_alloc_coherent, .free_coherent = dma_4v_free_coherent, .map_page = dma_4v_map_page, -- cgit v1.2.3 From 02f7a18935eef0e56d9180fc3c35da6aad1ff3bb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:13 +0900 Subject: sparc: Use asm-generic/dma-mapping-common.h Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-5-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/dma-mapping.h | 107 +++++------------------------------ arch/sparc/kernel/dma.c | 4 +- arch/sparc/kernel/iommu.c | 4 +- arch/sparc/kernel/pci_sun4v.c | 2 +- 5 files changed, 19 insertions(+), 99 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eabd..5f2df99645c9 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -25,6 +25,7 @@ config SPARC select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 + select HAVE_DMA_ATTRS config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 893f3ecc9750..34c92264208a 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -3,6 +3,7 @@ #include #include +#include #define DMA_ERROR_CODE (~(dma_addr_t)0x0) @@ -13,113 +14,31 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask); #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #define dma_is_consistent(d, h) (1) -extern const struct dma_map_ops *dma_ops; +extern struct dma_map_ops *dma_ops; -static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - return dma_ops->alloc_coherent(dev, size, dma_handle, flag); -} - -static inline void dma_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); -} - -static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, - size_t size, - enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - return dma_ops->map_page(dev, virt_to_page(cpu_addr), - (unsigned long)cpu_addr & ~PAGE_MASK, size, - direction, NULL); + return dma_ops; } -static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction) -{ - dma_ops->unmap_page(dev, dma_addr, size, direction, NULL); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - return dma_ops->map_page(dev, page, offset, size, direction, NULL); -} +#include -static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction) -{ - dma_ops->unmap_page(dev, dma_address, size, direction, NULL); -} - -static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) -{ - return 
dma_ops->map_sg(dev, sg, nents, direction, NULL); -} - -static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) -{ - dma_ops->unmap_sg(dev, sg, nents, direction, NULL); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - dma_ops->sync_single_for_cpu(dev, dma_handle, size, direction); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, - size_t size, - enum dma_data_direction direction) -{ - if (dma_ops->sync_single_for_device) - dma_ops->sync_single_for_device(dev, dma_handle, size, - direction); -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) { - dma_ops->sync_sg_for_cpu(dev, sg, nelems, direction); -} + struct dma_map_ops *ops = get_dma_ops(dev); -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - if (dma_ops->sync_sg_for_device) - dma_ops->sync_sg_for_device(dev, sg, nelems, direction); + return ops->alloc_coherent(dev, size, dma_handle, flag); } -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - enum dma_data_direction dir) +static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) { - dma_sync_single_for_cpu(dev, dma_handle+offset, size, dir); -} + struct dma_map_ops *ops = get_dma_ops(dev); -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - dma_sync_single_for_device(dev, dma_handle+offset, size, dir); + ops->free_coherent(dev, size, cpu_addr, dma_handle); } - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return (dma_addr == DMA_ERROR_CODE); diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c index 473a3fc7ab5b..15820a918171 100644 --- a/arch/sparc/kernel/dma.c +++ b/arch/sparc/kernel/dma.c @@ -165,7 +165,7 @@ static void dma32_sync_sg_for_device(struct device *dev, BUG(); } -static const struct dma_map_ops dma32_dma_ops = { +static struct dma_map_ops dma32_dma_ops = { .alloc_coherent = dma32_alloc_coherent, .free_coherent = dma32_free_coherent, .map_page = dma32_map_page, @@ -178,5 +178,5 @@ static const struct dma_map_ops dma32_dma_ops = { .sync_sg_for_device = dma32_sync_sg_for_device, }; -const struct dma_map_ops *dma_ops = &dma32_dma_ops; +struct dma_map_ops *dma_ops = &dma32_dma_ops; EXPORT_SYMBOL(dma_ops); diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index a9f0ad955186..74b289cab552 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -826,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev, spin_unlock_irqrestore(&iommu->lock, flags); } -static const struct dma_map_ops sun4u_dma_ops = { +static struct dma_map_ops sun4u_dma_ops = { .alloc_coherent = dma_4u_alloc_coherent, .free_coherent = dma_4u_free_coherent, .map_page = dma_4u_map_page, @@ -837,7 +837,7 @@ static const struct dma_map_ops sun4u_dma_ops = { .sync_sg_for_cpu = dma_4u_sync_sg_for_cpu, }; -const struct dma_map_ops *dma_ops = &sun4u_dma_ops; +struct dma_map_ops 
*dma_ops = &sun4u_dma_ops; EXPORT_SYMBOL(dma_ops); int dma_supported(struct device *dev, u64 device_mask) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index c4f7dce577dd..ee800f927929 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -539,7 +539,7 @@ static void dma_4v_sync_sg_for_cpu(struct device *dev, /* Nothing to do... */ } -static const struct dma_map_ops sun4v_dma_ops = { +static struct dma_map_ops sun4v_dma_ops = { .alloc_coherent = dma_4v_alloc_coherent, .free_coherent = dma_4v_free_coherent, .map_page = dma_4v_map_page, -- cgit v1.2.3 From 595cc8560783ea31ed1208dc1f97282a2a5606b7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:14 +0900 Subject: sparc: Remove no-op dma_4v_sync_single_for_cpu and dma_4v_sync_sg_for_cpu Now sparc uses include/asm-generic/dma-mapping-common.h. pci_sun4v.c doesn't need to have no-op dma_4v_sync_single_for_cpu and dma_4v_sync_sg_for_cpu (dma-mapping-common.h does nothing if sync_{single|sg}_for_cpu hook is not defined). So we can remove them safely. Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-6-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/kernel/pci_sun4v.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index ee800f927929..23c33ff9c31e 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -525,20 +525,6 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&iommu->lock, flags); } -static void dma_4v_sync_single_for_cpu(struct device *dev, - dma_addr_t bus_addr, size_t sz, - enum dma_data_direction direction) -{ - /* Nothing to do... */ -} - -static void dma_4v_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sglist, int nelems, - enum dma_data_direction direction) -{ - /* Nothing to do... */ -} - static struct dma_map_ops sun4v_dma_ops = { .alloc_coherent = dma_4v_alloc_coherent, .free_coherent = dma_4v_free_coherent, @@ -546,8 +532,6 @@ static struct dma_map_ops sun4v_dma_ops = { .unmap_page = dma_4v_unmap_page, .map_sg = dma_4v_map_sg, .unmap_sg = dma_4v_unmap_sg, - .sync_single_for_cpu = dma_4v_sync_single_for_cpu, - .sync_sg_for_cpu = dma_4v_sync_sg_for_cpu, }; static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm, -- cgit v1.2.3 From c2c07dbd8742a26ab3f1ee8b82237a060a0d9f61 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:15 +0900 Subject: sparc: Replace sbus_map_single and sbus_unmap_single with sbus_map_page and sbus_unmap_page This is a preparation for using asm-generic/pci-dma-compat.h; SPARC32 has two dma_map_ops structures for pci and sbus (removing arch/sparc/kernel/dma.c, PCI and SBUS DMA accessor). Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. 
Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-7-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/kernel/dma.c | 5 ++--- arch/sparc/kernel/dma.h | 6 +++--- arch/sparc/kernel/ioport.c | 7 +++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c index 15820a918171..a5d50dac735c 100644 --- a/arch/sparc/kernel/dma.c +++ b/arch/sparc/kernel/dma.c @@ -68,8 +68,7 @@ static dma_addr_t dma32_map_page(struct device *dev, struct page *page, return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction); #endif - return sbus_map_single(dev, page_address(page) + offset, - size, (int)direction); + return sbus_map_page(dev, page, offset, size, (int)direction); } static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address, @@ -83,7 +82,7 @@ static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address, return; } #endif - sbus_unmap_single(dev, dma_address, size, (int)direction); + sbus_unmap_page(dev, dma_address, size, (int)direction); } static int dma32_map_sg(struct device *dev, struct scatterlist *sg, diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h index f8d8951adb53..680351ee0d40 100644 --- a/arch/sparc/kernel/dma.h +++ b/arch/sparc/kernel/dma.h @@ -1,8 +1,8 @@ void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp); void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba); -dma_addr_t sbus_map_single(struct device *dev, void *va, - size_t len, int direction); -void sbus_unmap_single(struct device *dev, dma_addr_t ba, +dma_addr_t sbus_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t len, int direction); +void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, int direction); int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction); diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index 87ea0d03d975..39ff1e0c5183 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -337,8 +337,11 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba) * CPU view of this memory may be inconsistent with * a device view and explicit flushing is necessary. */ -dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int direction) +dma_addr_t sbus_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t len, int direction) { + void *va = page_address(page) + offset; + /* XXX why are some lengths signed, others unsigned? */ if (len <= 0) { return 0; @@ -350,7 +353,7 @@ dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int directi return mmu_get_scsi_one(dev, va, len); } -void sbus_unmap_single(struct device *dev, dma_addr_t ba, size_t n, int direction) +void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, int direction) { mmu_release_scsi_one(dev, ba, n); } -- cgit v1.2.3 From ee664a9252d24ef10317d1bba8fc8f4c6495b36c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:16 +0900 Subject: sparc: Use asm-generic/pci-dma-compat This converts SPARC to use asm-generic/pci-dma-compat instead of the homegrown mechanism. SPARC32 has two dma_map_ops structures for pci and sbus (removing arch/sparc/kernel/dma.c, the combined PCI and SBUS DMA accessor). The global 'dma_ops' is set to sbus_dma_ops and get_dma_ops() returns pci32_dma_ops for pci devices so we can use the appropriate dma mapping operations.
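To make that dispatch concrete, here is a minimal, compilable sketch (illustrative *_sketch names, not the kernel sources) of how a get_dma_ops()-style helper selects the pci32 operations for PCI devices and falls back to the global (sbus) operations for everything else:

#include <stdio.h>

struct bus_type_sketch { const char *name; };

static const struct bus_type_sketch pci_bus  = { "pci" };
static const struct bus_type_sketch sbus_bus = { "sbus" };

struct device_sketch { const struct bus_type_sketch *bus; };

struct dma_map_ops_sketch { const char *label; };

static const struct dma_map_ops_sketch pci32_ops = { "pci32_dma_ops" };
static const struct dma_map_ops_sketch sbus_ops  = { "sbus_dma_ops" };

/* global default, like 'dma_ops = &sbus_dma_ops' on SPARC32 */
static const struct dma_map_ops_sketch *dma_ops = &sbus_ops;

static const struct dma_map_ops_sketch *
get_dma_ops_sketch(const struct device_sketch *dev)
{
	/* PCI devices get the pci32 operations; everything else
	 * uses the global (sbus) operations */
	if (dev->bus == &pci_bus)
		return &pci32_ops;
	return dma_ops;
}

int main(void)
{
	struct device_sketch pci_dev  = { &pci_bus };
	struct device_sketch sbus_dev = { &sbus_bus };

	printf("pci device  -> %s\n", get_dma_ops_sketch(&pci_dev)->label);
	printf("sbus device -> %s\n", get_dma_ops_sketch(&sbus_dev)->label);
	return 0;
}

Every generic helper in asm-generic/dma-mapping-common.h then goes through this one function, which is what lets the two bus-specific ops tables replace the open-coded if (dev->bus == &pci_bus_type) checks scattered through the old dma32_* wrappers.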
Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-8-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/dma-mapping.h | 7 +- arch/sparc/include/asm/pci.h | 3 + arch/sparc/include/asm/pci_32.h | 105 ----------------------- arch/sparc/include/asm/pci_64.h | 88 ------------------- arch/sparc/kernel/dma.c | 155 ++------------------------------- arch/sparc/kernel/dma.h | 14 --- arch/sparc/kernel/iommu.c | 4 +- arch/sparc/kernel/ioport.c | 162 ++++++++++++++++------------------- arch/sparc/kernel/pci.c | 2 +- 9 files changed, 96 insertions(+), 444 deletions(-) delete mode 100644 arch/sparc/kernel/dma.h diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 34c92264208a..2677818dc781 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -14,10 +14,15 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask); #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #define dma_is_consistent(d, h) (1) -extern struct dma_map_ops *dma_ops; +extern struct dma_map_ops *dma_ops, pci32_dma_ops; +extern struct bus_type pci_bus_type; static inline struct dma_map_ops *get_dma_ops(struct device *dev) { +#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) + if (dev->bus == &pci_bus_type) + return &pci32_dma_ops; +#endif return dma_ops; } diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h index 6e14fd179335..d9c031f9910f 100644 --- a/arch/sparc/include/asm/pci.h +++ b/arch/sparc/include/asm/pci.h @@ -5,4 +5,7 @@ #else #include #endif + +#include + #endif diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h index b41c4c198159..ac0e8369fd97 100644 --- a/arch/sparc/include/asm/pci_32.h +++ b/arch/sparc/include/asm/pci_32.h @@ -31,42 +31,8 @@ static inline void pcibios_penalize_isa_irq(int irq, int active) */ #define PCI_DMA_BUS_IS_PHYS (0) -#include - struct pci_dev; -/* Allocate and map kernel buffer using consistent mode DMA for a device. - * hwdev should be valid struct pci_dev pointer for PCI devices. - */ -extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle); - -/* Free and unmap a consistent DMA buffer. - * cpu_addr is what was returned from pci_alloc_consistent, - * size must be the same as what as passed into pci_alloc_consistent, - * and likewise dma_addr must be the same as what *dma_addrp was set to. - * - * References to the memory and mappings assosciated with cpu_addr/dma_addr - * past this call are illegal. - */ -extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle); - -/* Map a single buffer of the indicated size for DMA in streaming mode. - * The 32-bit bus address to use is returned. - * - * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single_for_cpu is performed. - */ -extern dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction); - -/* Unmap a single streaming mode DMA translation. The dma_addr and size - * must match what was provided for in a previous pci_map_single call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guaranteed to see - * whatever the device wrote there. 
- */ -extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction); - /* pci_unmap_{single,page} is not a nop, thus... */ #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; @@ -81,69 +47,6 @@ extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t #define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ (((PTR)->LEN_NAME) = (VAL)) -/* - * Same as above, only with pages instead of mapped addresses. - */ -extern dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction); -extern void pci_unmap_page(struct pci_dev *hwdev, - dma_addr_t dma_address, size_t size, int direction); - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scather-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction); - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nhwents, int direction); - -/* Make physical memory consistent for a single - * streaming mode DMA translation after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, you - * must first perform a pci_dma_sync_for_device, and then the device - * again owns the buffer. - */ -extern void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction); -extern void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction); - -/* Make physical memory consistent for a set of streaming - * mode DMA translations after a transfer. - * - * The same as pci_dma_sync_single_* but for a scatter-gather list, - * same rules and usage. - */ -extern void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction); -extern void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction); - -/* Return whether the given PCI device DMA address mask can - * be supported properly. For example, if your device can - * only drive the low 24-bits during PCI bus mastering, then - * you would pass 0x00ffffff as the mask to this function. 
- */ -static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) -{ - return 1; -} - #ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, @@ -154,14 +57,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, } #endif -#define PCI_DMA_ERROR_CODE (~(dma_addr_t)0x0) - -static inline int pci_dma_mapping_error(struct pci_dev *pdev, - dma_addr_t dma_addr) -{ - return (dma_addr == PCI_DMA_ERROR_CODE); -} - struct device_node; extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev); diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 7a1e3566e59c..5cc9f6aa5494 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -35,37 +35,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active) */ #define PCI_DMA_BUS_IS_PHYS (0) -static inline void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, - dma_addr_t *dma_handle) -{ - return dma_alloc_coherent(&pdev->dev, size, dma_handle, GFP_ATOMIC); -} - -static inline void pci_free_consistent(struct pci_dev *pdev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - return dma_free_coherent(&pdev->dev, size, vaddr, dma_handle); -} - -static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, - size_t size, int direction) -{ - return dma_map_single(&pdev->dev, ptr, size, - (enum dma_data_direction) direction); -} - -static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, - size_t size, int direction) -{ - dma_unmap_single(&pdev->dev, dma_addr, size, - (enum dma_data_direction) direction); -} - -#define pci_map_page(dev, page, off, size, dir) \ - pci_map_single(dev, (page_address(page) + (off)), size, dir) -#define pci_unmap_page(dev,addr,sz,dir) \ - pci_unmap_single(dev,addr,sz,dir) - /* pci_unmap_{single,page} is not a nop, thus... */ #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; @@ -80,57 +49,6 @@ static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr, #define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ (((PTR)->LEN_NAME) = (VAL)) -static inline int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg, - int nents, int direction) -{ - return dma_map_sg(&pdev->dev, sg, nents, - (enum dma_data_direction) direction); -} - -static inline void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg, - int nents, int direction) -{ - dma_unmap_sg(&pdev->dev, sg, nents, - (enum dma_data_direction) direction); -} - -static inline void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - dma_sync_single_for_cpu(&pdev->dev, dma_handle, size, - (enum dma_data_direction) direction); -} - -static inline void pci_dma_sync_single_for_device(struct pci_dev *pdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - /* No flushing needed to sync cpu writes to the device. */ -} - -static inline void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, - struct scatterlist *sg, - int nents, int direction) -{ - dma_sync_sg_for_cpu(&pdev->dev, sg, nents, - (enum dma_data_direction) direction); -} - -static inline void pci_dma_sync_sg_for_device(struct pci_dev *pdev, - struct scatterlist *sg, - int nelems, int direction) -{ - /* No flushing needed to sync cpu writes to the device. */ -} - -/* Return whether the given PCI device DMA address mask can - * be supported properly. 
For example, if your device can - * only drive the low 24-bits during PCI bus mastering, then - * you would pass 0x00ffffff as the mask to this function. - */ -extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask); - /* PCI IOMMU mapping bypass support. */ /* PCI 64-bit addressing works for all slots on all controller @@ -140,12 +58,6 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask); #define PCI64_REQUIRED_MASK (~(dma64_addr_t)0) #define PCI64_ADDR_BASE 0xfffc000000000000UL -static inline int pci_dma_mapping_error(struct pci_dev *pdev, - dma_addr_t dma_addr) -{ - return dma_mapping_error(&pdev->dev, dma_addr); -} - #ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c index a5d50dac735c..b2fa3127f605 100644 --- a/arch/sparc/kernel/dma.c +++ b/arch/sparc/kernel/dma.c @@ -13,13 +13,17 @@ #include #endif -#include "dma.h" - +/* + * Return whether the given PCI device DMA address mask can be + * supported properly. For example, if your device can only drive the + * low 24-bits during PCI bus mastering, then you would pass + * 0x00ffffff as the mask to this function. + */ int dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) - return pci_dma_supported(to_pci_dev(dev), mask); + return 1; #endif return 0; } @@ -34,148 +38,3 @@ int dma_set_mask(struct device *dev, u64 dma_mask) return -EOPNOTSUPP; } EXPORT_SYMBOL(dma_set_mask); - -static void *dma32_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle); -#endif - return sbus_alloc_consistent(dev, size, dma_handle); -} - -static void dma32_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_free_consistent(to_pci_dev(dev), size, - cpu_addr, dma_handle); - return; - } -#endif - sbus_free_consistent(dev, size, cpu_addr, dma_handle); -} - -static dma_addr_t dma32_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - return pci_map_page(to_pci_dev(dev), page, offset, - size, (int)direction); -#endif - return sbus_map_page(dev, page, offset, size, (int)direction); -} - -static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_unmap_page(to_pci_dev(dev), dma_address, - size, (int)direction); - return; - } -#endif - sbus_unmap_page(dev, dma_address, size, (int)direction); -} - -static int dma32_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction); -#endif - return sbus_map_sg(dev, sg, nents, direction); -} - -void dma32_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_unmap_sg(to_pci_dev(dev), sg, nents, (int)direction); - return; - } -#endif - sbus_unmap_sg(dev, sg, nents, 
(int)direction); -} - -static void dma32_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, - enum dma_data_direction direction) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle, - size, (int)direction); - return; - } -#endif - sbus_dma_sync_single_for_cpu(dev, dma_handle, size, (int) direction); -} - -static void dma32_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle, - size, (int)direction); - return; - } -#endif - sbus_dma_sync_single_for_device(dev, dma_handle, size, (int) direction); -} - -static void dma32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, - nelems, (int)direction); - return; - } -#endif - BUG(); -} - -static void dma32_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, - nelems, (int)direction); - return; - } -#endif - BUG(); -} - -static struct dma_map_ops dma32_dma_ops = { - .alloc_coherent = dma32_alloc_coherent, - .free_coherent = dma32_free_coherent, - .map_page = dma32_map_page, - .unmap_page = dma32_unmap_page, - .map_sg = dma32_map_sg, - .unmap_sg = dma32_unmap_sg, - .sync_single_for_cpu = dma32_sync_single_for_cpu, - .sync_single_for_device = dma32_sync_single_for_device, - .sync_sg_for_cpu = dma32_sync_sg_for_cpu, - .sync_sg_for_device = dma32_sync_sg_for_device, -}; - -struct dma_map_ops *dma_ops = &dma32_dma_ops; -EXPORT_SYMBOL(dma_ops); diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h deleted file mode 100644 index 680351ee0d40..000000000000 --- a/arch/sparc/kernel/dma.h +++ /dev/null @@ -1,14 +0,0 @@ -void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp); -void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba); -dma_addr_t sbus_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t len, int direction); -void sbus_unmap_page(struct device *dev, dma_addr_t ba, - size_t n, int direction); -int sbus_map_sg(struct device *dev, struct scatterlist *sg, - int n, int direction); -void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, - int n, int direction); -void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, - size_t size, int direction); -void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, - size_t size, int direction); diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 74b289cab552..7690cc219ecc 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -840,6 +840,8 @@ static struct dma_map_ops sun4u_dma_ops = { struct dma_map_ops *dma_ops = &sun4u_dma_ops; EXPORT_SYMBOL(dma_ops); +extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask); + int dma_supported(struct device *dev, u64 device_mask) { struct iommu *iommu = dev->archdata.iommu; @@ -853,7 +855,7 @@ int dma_supported(struct device *dev, u64 device_mask) #ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) - return pci_dma_supported(to_pci_dev(dev), device_mask); + return pci64_dma_supported(to_pci_dev(dev), device_mask); #endif return 
0; diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index 39ff1e0c5183..1eb604389655 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -48,8 +48,6 @@ #include #include -#include "dma.h" - #define mmu_inval_dma_area(p, l) /* Anton pulled it out for 2.4.0-xx */ static struct resource *_sparc_find_resource(struct resource *r, @@ -246,7 +244,8 @@ EXPORT_SYMBOL(sbus_set_sbus64); * Typically devices use them for control blocks. * CPU may access them without any explicit flushing. */ -void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp) +static void *sbus_alloc_coherent(struct device *dev, size_t len, + dma_addr_t *dma_addrp, gfp_t gfp) { struct of_device *op = to_of_device(dev); unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK; @@ -299,7 +298,8 @@ err_nopages: return NULL; } -void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba) +static void sbus_free_coherent(struct device *dev, size_t n, void *p, + dma_addr_t ba) { struct resource *res; struct page *pgv; @@ -317,7 +317,7 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba) n = (n + PAGE_SIZE-1) & PAGE_MASK; if ((res->end-res->start)+1 != n) { - printk("sbus_free_consistent: region 0x%lx asked 0x%lx\n", + printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n", (long)((res->end-res->start)+1), n); return; } @@ -337,8 +337,10 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba) * CPU view of this memory may be inconsistent with * a device view and explicit flushing is necessary. */ -dma_addr_t sbus_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t len, int direction) +static dma_addr_t sbus_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t len, + enum dma_data_direction dir, + struct dma_attrs *attrs) { void *va = page_address(page) + offset; @@ -353,12 +355,14 @@ dma_addr_t sbus_map_page(struct device *dev, struct page *page, return mmu_get_scsi_one(dev, va, len); } -void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, int direction) +static void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n, + enum dma_data_direction dir, struct dma_attrs *attrs) { mmu_release_scsi_one(dev, ba, n); } -int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction) +static int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, + enum dma_data_direction dir, struct dma_attrs *attrs) { mmu_get_scsi_sgl(dev, sg, n); @@ -369,19 +373,38 @@ int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction return n; } -void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, int direction) +static void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, + enum dma_data_direction dir, struct dma_attrs *attrs) { mmu_release_scsi_sgl(dev, sg, n); } -void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, int direction) +static void sbus_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int n, enum dma_data_direction dir) { + BUG(); } -void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, int direction) +static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int n, enum dma_data_direction dir) { + BUG(); } +struct dma_map_ops sbus_dma_ops = { + .alloc_coherent = sbus_alloc_coherent, + .free_coherent = sbus_free_coherent, + .map_page = sbus_map_page, + .unmap_page = sbus_unmap_page, + 
.map_sg = sbus_map_sg, + .unmap_sg = sbus_unmap_sg, + .sync_sg_for_cpu = sbus_sync_sg_for_cpu, + .sync_sg_for_device = sbus_sync_sg_for_device, +}; + +struct dma_map_ops *dma_ops = &sbus_dma_ops; +EXPORT_SYMBOL(dma_ops); + static int __init sparc_register_ioport(void) { register_proc_sparc_ioport(); @@ -398,7 +421,8 @@ arch_initcall(sparc_register_ioport); /* Allocate and map kernel buffer using consistent mode DMA for a device. * hwdev should be valid struct pci_dev pointer for PCI devices. */ -void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba) +static void *pci32_alloc_coherent(struct device *dev, size_t len, + dma_addr_t *pba, gfp_t gfp) { unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK; unsigned long va; @@ -442,7 +466,6 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba) *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */ return (void *) res->start; } -EXPORT_SYMBOL(pci_alloc_consistent); /* Free and unmap a consistent DMA buffer. * cpu_addr is what was returned from pci_alloc_consistent, @@ -452,7 +475,8 @@ EXPORT_SYMBOL(pci_alloc_consistent); * References to the memory and mappings associated with cpu_addr/dma_addr * past this call are illegal. */ -void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba) +static void pci32_free_coherent(struct device *dev, size_t n, void *p, + dma_addr_t ba) { struct resource *res; unsigned long pgp; @@ -484,60 +508,18 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba) free_pages(pgp, get_order(n)); } -EXPORT_SYMBOL(pci_free_consistent); - -/* Map a single buffer of the indicated size for DMA in streaming mode. - * The 32-bit bus address to use is returned. - * - * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single_* is performed. - */ -dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, - int direction) -{ - BUG_ON(direction == PCI_DMA_NONE); - /* IIep is write-through, not flushing. */ - return virt_to_phys(ptr); -} -EXPORT_SYMBOL(pci_map_single); - -/* Unmap a single streaming mode DMA translation. The dma_addr and size - * must match what was provided for in a previous pci_map_single call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guaranteed to see - * whatever the device wrote there. - */ -void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size, - int direction) -{ - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { - mmu_inval_dma_area((unsigned long)phys_to_virt(ba), - (size + PAGE_SIZE-1) & PAGE_MASK); - } -} -EXPORT_SYMBOL(pci_unmap_single); /* * Same as pci_map_single, but with pages. */ -dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) +static dma_addr_t pci32_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) { - BUG_ON(direction == PCI_DMA_NONE); /* IIep is write-through, not flushing. */ return page_to_phys(page) + offset; } -EXPORT_SYMBOL(pci_map_page); - -void pci_unmap_page(struct pci_dev *hwdev, - dma_addr_t dma_address, size_t size, int direction) -{ - BUG_ON(direction == PCI_DMA_NONE); - /* mmu_inval_dma_area XXX */ -} -EXPORT_SYMBOL(pci_unmap_page); /* Map a set of buffers described by scatterlist in streaming * mode for DMA. 
This is the scather-gather version of the @@ -554,13 +536,13 @@ EXPORT_SYMBOL(pci_unmap_page); * Device ownership issues as mentioned above for pci_map_single are * the same here. */ -int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, - int direction) +static int pci32_map_sg(struct device *device, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct scatterlist *sg; int n; - BUG_ON(direction == PCI_DMA_NONE); /* IIep is write-through, not flushing. */ for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg_page(sg)) == NULL); @@ -569,20 +551,19 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, } return nents; } -EXPORT_SYMBOL(pci_map_sg); /* Unmap a set of streaming mode DMA translations. * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. */ -void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, - int direction) +static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct scatterlist *sg; int n; - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { + if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg_page(sg)) == NULL); mmu_inval_dma_area( @@ -591,7 +572,6 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, } } } -EXPORT_SYMBOL(pci_unmap_sg); /* Make physical memory consistent for a single * streaming mode DMA translation before or after a transfer. @@ -603,25 +583,23 @@ EXPORT_SYMBOL(pci_unmap_sg); * must first perform a pci_dma_sync_for_device, and then the * device again owns the buffer. */ -void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) +static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba, + size_t size, enum dma_data_direction dir) { - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { + if (dir != PCI_DMA_TODEVICE) { mmu_inval_dma_area((unsigned long)phys_to_virt(ba), (size + PAGE_SIZE-1) & PAGE_MASK); } } -EXPORT_SYMBOL(pci_dma_sync_single_for_cpu); -void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction) +static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba, + size_t size, enum dma_data_direction dir) { - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { + if (dir != PCI_DMA_TODEVICE) { mmu_inval_dma_area((unsigned long)phys_to_virt(ba), (size + PAGE_SIZE-1) & PAGE_MASK); } } -EXPORT_SYMBOL(pci_dma_sync_single_for_device); /* Make physical memory consistent for a set of streaming * mode DMA translations after a transfer. @@ -629,13 +607,13 @@ EXPORT_SYMBOL(pci_dma_sync_single_for_device); * The same as pci_dma_sync_single_* but for a scatter-gather list, * same rules and usage. 
*/ -void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) +static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir) { struct scatterlist *sg; int n; - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { + if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg_page(sg)) == NULL); mmu_inval_dma_area( @@ -644,15 +622,14 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int } } } -EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu); -void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction) +static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl, + int nents, enum dma_data_direction dir) { struct scatterlist *sg; int n; - BUG_ON(direction == PCI_DMA_NONE); - if (direction != PCI_DMA_TODEVICE) { + if (dir != PCI_DMA_TODEVICE) { for_each_sg(sgl, sg, nents, n) { BUG_ON(page_address(sg_page(sg)) == NULL); mmu_inval_dma_area( @@ -661,7 +638,20 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, } } } -EXPORT_SYMBOL(pci_dma_sync_sg_for_device); + +struct dma_map_ops pci32_dma_ops = { + .alloc_coherent = pci32_alloc_coherent, + .free_coherent = pci32_free_coherent, + .map_page = pci32_map_page, + .map_sg = pci32_map_sg, + .unmap_sg = pci32_unmap_sg, + .sync_single_for_cpu = pci32_sync_single_for_cpu, + .sync_single_for_device = pci32_sync_single_for_device, + .sync_sg_for_cpu = pci32_sync_sg_for_cpu, + .sync_sg_for_device = pci32_sync_sg_for_device, +}; +EXPORT_SYMBOL(pci32_dma_ops); + #endif /* CONFIG_PCI */ #ifdef CONFIG_PROC_FS diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 57859ad23547..c68648662802 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -1039,7 +1039,7 @@ static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit) pci_dev_put(ali_isa_bridge); } -int pci_dma_supported(struct pci_dev *pdev, u64 device_mask) +int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask) { u64 dma_addr_mask; -- cgit v1.2.3 From 451d7400a34cb679369e337d67f0238ed410f484 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 10 Aug 2009 11:53:17 +0900 Subject: sparc: Add CONFIG_DMA_API_DEBUG support All we need to do for CONFIG_DMA_API_DEBUG support is call dma_debug_init() in DMA code common for SPARC32 and SPARC64. Now SPARC32 uses two dma_map_ops structures for pci and sbus so there is not much dma stuff for SPARC32 in kernel/dma.c. kernel/ioport.c also includes dma stuff for SPARC32. So let's put all the dma stuff for SPARC32 in kernel/ioport.c and make kernel/dma.c common for SPARC32 and SPARC64. Signed-off-by: FUJITA Tomonori Tested-by: Robert Reif Acked-by: David S. 
Miller Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-9-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/dma-mapping.h | 6 +++++- arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/dma.c | 37 +++++------------------------------- arch/sparc/kernel/ioport.c | 27 ++++++++++++++++++++++++++ 5 files changed, 39 insertions(+), 34 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 5f2df99645c9..233cff53a623 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -26,6 +26,7 @@ config SPARC select RTC_CLASS select RTC_DRV_M48T59 select HAVE_DMA_ATTRS + select HAVE_DMA_API_DEBUG config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 2677818dc781..5a8c308e2b5c 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -32,8 +32,11 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; - return ops->alloc_coherent(dev, size, dma_handle, flag); + cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + return cpu_addr; } static inline void dma_free_coherent(struct device *dev, size_t size, @@ -41,6 +44,7 @@ static inline void dma_free_coherent(struct device *dev, size_t size, { struct dma_map_ops *ops = get_dma_ops(dev); + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); ops->free_coherent(dev, size, cpu_addr, dma_handle); } diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 475ce4696acd..29b88a580661 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -61,7 +61,7 @@ obj-$(CONFIG_SPARC64_SMP) += cpumap.o obj-$(CONFIG_SPARC32) += devres.o devres-y := ../../../kernel/irq/devres.o -obj-$(CONFIG_SPARC32) += dma.o +obj-y += dma.o obj-$(CONFIG_SPARC32_PCI) += pcic.o diff --git a/arch/sparc/kernel/dma.c b/arch/sparc/kernel/dma.c index b2fa3127f605..e1ba8ee21b9a 100644 --- a/arch/sparc/kernel/dma.c +++ b/arch/sparc/kernel/dma.c @@ -1,40 +1,13 @@ -/* dma.c: PCI and SBUS DMA accessors for 32-bit sparc. - * - * Copyright (C) 2008 David S. Miller - */ - #include #include #include -#include -#include +#include -#ifdef CONFIG_PCI -#include -#endif +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 15) -/* - * Return whether the given PCI device DMA address mask can be - * supported properly. For example, if your device can only drive the - * low 24-bits during PCI bus mastering, then you would pass - * 0x00ffffff as the mask to this function. 
- */ -int dma_supported(struct device *dev, u64 mask) +static int __init dma_init(void) { -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - return 1; -#endif + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); return 0; } -EXPORT_SYMBOL(dma_supported); - -int dma_set_mask(struct device *dev, u64 dma_mask) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - return pci_set_dma_mask(to_pci_dev(dev), dma_mask); -#endif - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(dma_set_mask); +fs_initcall(dma_init); diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index 1eb604389655..edbea232c617 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -654,6 +654,33 @@ EXPORT_SYMBOL(pci32_dma_ops); #endif /* CONFIG_PCI */ +/* + * Return whether the given PCI device DMA address mask can be + * supported properly. For example, if your device can only drive the + * low 24-bits during PCI bus mastering, then you would pass + * 0x00ffffff as the mask to this function. + */ +int dma_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return 1; +#endif + return 0; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 dma_mask) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return pci_set_dma_mask(to_pci_dev(dev), dma_mask); +#endif + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(dma_set_mask); + + #ifdef CONFIG_PROC_FS static int -- cgit v1.2.3 From beda2c7ea2c15ed01eef00a997d2b0496c3a502d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Sun, 9 Aug 2009 15:34:39 -0700 Subject: futex: Update futex_q lock_ptr on requeue proxy lock futex_requeue() can acquire the lock on behalf of a waiter early on or during the requeue loop if it is uncontended or in the event of a lock steal or owner died. On wakeup, the waiter (in futex_wait_requeue_pi()) cleans up the pi_state owner using the lock_ptr to protect against concurrent access to the pi_state. The pi_state is hung off futex_q's on the requeue target futex hash bucket so the lock_ptr needs to be updated accordingly. The problem manifested by triggering the WARN_ON in lookup_pi_state() about the pid != pi_state->owner->pid. With this patch, the pi_state is properly guarded against concurrent access via the requeue target hb lock. The astute reviewer may notice that there is a window of time between when futex_requeue() unlocks the hb locks and when futex_wait_requeue_pi() will acquire hb2->lock. During this time the pi_state and uval are not in sync with the underlying rtmutex owner (but the uval does indicate there are waiters, so no atomic changes will occur in userspace). However, this is not a problem. Should a contending thread enter lookup_pi_state() and acquire hb2->lock before the ownership is fixed up, it will find the pi_state hung off a waiter's (possibly the pending owner's) futex_q and block on the rtmutex. Once futex_wait_requeue_pi() fixes up the owner, it will also move the pi_state from the old owner's task->pi_state_list to its own. v3: Fix plist lock name for application to mainline (rather than -rt) Compile tested against tip/v2.6.31-rc5. 
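As a rough sketch of the invariant this patch restores (simplified user-space stand-ins for futex_hash_bucket and futex_q, not the kernel's actual futex code): whichever hash bucket a futex_q currently hangs off, its lock_ptr must point at that bucket's lock, because the woken waiter serializes its pi_state cleanup through *lock_ptr:

    #include <pthread.h>

    /* Hypothetical, simplified stand-ins for the kernel structures. */
    struct bucket { pthread_mutex_t lock; };
    struct waiter { pthread_mutex_t *lock_ptr; };

    /* Requeue side: moving a waiter to hb2 must retarget lock_ptr too,
     * otherwise fixup_owner() below takes the old bucket's lock and
     * races with anyone holding hb2's lock. */
    static void requeue_to(struct waiter *q, struct bucket *hb2)
    {
            q->lock_ptr = &hb2->lock;       /* the update this patch adds */
    }

    /* Waiter side: pi_state cleanup is serialized on *q->lock_ptr. */
    static void fixup_owner(struct waiter *q)
    {
            pthread_mutex_lock(q->lock_ptr);
            /* ... fix up the pi_state owner under the right lock ... */
            pthread_mutex_unlock(q->lock_ptr);
    }
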
Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7F4EFF.6090903@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0672ff88f159..8cc3ee1363a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,15 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex + * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Must be called with the q->lock_ptr held. + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. */ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); @@ -1030,6 +1034,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; + q->lock_ptr = &hb->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb->lock; +#endif + wake_up_state(q->task, TASK_NORMAL); } @@ -1088,7 +1097,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2); + requeue_pi_wake_futex(top_waiter, key2, hb2); return ret; } @@ -1273,7 +1282,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. */ - requeue_pi_wake_futex(this, &key2); + requeue_pi_wake_futex(this, &key2, hb2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3 From c7425314c755d5f94da7c978205c85a7c6201212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 9 Aug 2009 17:03:52 +0900 Subject: x86: Introduce GDT_ENTRY_INIT(), initialize bad_bios_desc statically Fully initialize bad_bios_desc statically instead of doing some fields statically and some dynamically. Suggested-by: "H. Peter Anvin" Signed-off-by: Akinobu Mita LKML-Reference: <20090809080350.GA4765@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 19 +++++++++---------- drivers/pnp/pnpbios/bioscalls.c | 5 ++--- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index febb2dab254f..39a4462ef8a7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). 
+ * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -2331,15 +2339,6 @@ static int __init apm_init(void) } pm_flags |= PM_APM; - /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); - set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); - /* * Set up the long jump entry point to the APM BIOS, which is called * from inline assembly. diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index bd035e3d3550..fc83783c3a96 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -60,7 +60,8 @@ do { \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); /* * At some point we want to use this stack frame pointer to unwind @@ -476,8 +477,6 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.offset = header->fields.pm16offset; pnp_bios_callpoint.segment = PNP_CS16; - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); - set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { struct desc_struct *gdt = get_cpu_gdt_table(i); if (!gdt) -- cgit v1.2.3 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. 
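A minimal sketch of the pattern this enables (hypothetical module code; only the standard <linux/wait.h> API is assumed): a custom wake function runs with the first waitqueue's lock held, so waking a second waitqueue from inside it nests one waitqueue lock inside another. With a single shared lock class, lockdep would flag that as recursive locking; a key per init_waitqueue_head() call site puts the two heads in distinct classes:

    #include <linux/init.h>
    #include <linux/wait.h>

    static wait_queue_head_t page_wq;       /* hypothetical queues */
    static wait_queue_head_t copy_wq;

    /* Called with page_wq.lock held; wake_up() takes copy_wq.lock.
     * Distinct per-call-site classes make this nesting legal instead
     * of a false recursion report. */
    static int monitor_wake(wait_queue_t *wait, unsigned mode, int sync,
                            void *key)
    {
            wake_up(&copy_wq);
            return autoremove_wake_function(wait, mode, sync, key);
    }

    static int __init monitor_init(void)
    {
            init_waitqueue_head(&page_wq);  /* class: this call site */
            init_waitqueue_head(&copy_wq);  /* class: this call site */
            return 0;
    }
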
Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- kernel/wait.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4ca..cf3c2f5dba51 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275cf..c4bd3d825f35 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3 From 4dc88029fd916b860ef063c40180aa604ce93494 Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Mon, 10 Aug 2009 18:31:42 +0530 Subject: futex: Fix compat_futex to be same as futex for REQUEUE_PI Need to add the REQUEUE_PI checks to the compat_sys_futex API as well to ensure 32 bit requeue's work fine on a 64 bit system. Patch is against latest tip Signed-off-by: Dinakar Guniguntala Cc: Darren Hart LKML-Reference: <20090810130142.GA23619@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d607a5b9ee29..235716556bf1 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) @@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = ktime_add_safe(ktime_get(), t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); -- cgit v1.2.3 From 9f8666971185b86615a074bcac67c90fdf8af8bc Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Mon, 10 Aug 2009 15:26:32 +0200 Subject: perf report: Add raw displaying of per-thread counters If --pretty=raw is given to perf report -T, it now displays one line per-thread per-counter with the raw event id added. 
We get: # PID TID Name Raw Count 18608 18609 cache-misses 28e 416744 18608 18609 cache-references 28f 6456792 18608 18608 cache-misses 28e 448219 18608 18608 cache-references 28f 7270244 instead of: # PID TID cache-misses cache-references 18608 18609 416744 6456792 18608 18608 448219 7270244 Signed-off-by: Brice Goglin Acked-by: Peter Zijlstra LKML-Reference: <4A802008.5050409@inria.fr> Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 11 +++++++- tools/perf/util/values.c | 62 ++++++++++++++++++++++++++++++++++++++++++++- tools/perf/util/values.h | 3 ++- 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 41639182fb3f..2357c66fb91d 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -57,6 +57,9 @@ static int show_nr_samples; static int show_threads; static struct perf_read_values show_threads_values; +static char default_pretty_printing_style[] = "normal"; +static char *pretty_printing_style = default_pretty_printing_style; + static unsigned long page_size; static unsigned long mmap_window = 32; @@ -1401,6 +1404,9 @@ static size_t output__fprintf(FILE *fp, u64 total_samples) size_t ret = 0; unsigned int width; char *col_width = col_width_list_str; + int raw_printing_style; + + raw_printing_style = !strcmp(pretty_printing_style, "raw"); init_rem_hits(); @@ -1478,7 +1484,8 @@ print_entries: free(rem_sq_bracket); if (show_threads) - perf_read_values_display(fp, &show_threads_values); + perf_read_values_display(fp, &show_threads_values, + raw_printing_style); return ret; } @@ -2091,6 +2098,8 @@ static const struct option options[] = { "Show a column with the number of samples"), OPT_BOOLEAN('T', "threads", &show_threads, "Show per-thread event counters"), + OPT_STRING(0, "pretty", &pretty_printing_style, "key", + "pretty printing style key: normal raw"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('P', "full-paths", &full_paths, diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c index 8551c0b8b233..614cfaf4712a 100644 --- a/tools/perf/util/values.c +++ b/tools/perf/util/values.c @@ -126,7 +126,8 @@ void perf_read_values_add_value(struct perf_read_values *values, values->value[tindex][cindex] = value; } -void perf_read_values_display(FILE *fp, struct perf_read_values *values) +static void perf_read_values__display_pretty(FILE *fp, + struct perf_read_values *values) { int i, j; int pidwidth, tidwidth; @@ -169,3 +170,62 @@ void perf_read_values_display(FILE *fp, struct perf_read_values *values) fprintf(fp, "\n"); } } + +static void perf_read_values__display_raw(FILE *fp, + struct perf_read_values *values) +{ + int width, pidwidth, tidwidth, namewidth, rawwidth, countwidth; + int i, j; + + tidwidth = 3; /* TID */ + pidwidth = 3; /* PID */ + namewidth = 4; /* "Name" */ + rawwidth = 3; /* "Raw" */ + countwidth = 5; /* "Count" */ + + for (i = 0; i < values->threads; i++) { + width = snprintf(NULL, 0, "%d", values->pid[i]); + if (width > pidwidth) + pidwidth = width; + width = snprintf(NULL, 0, "%d", values->tid[i]); + if (width > tidwidth) + tidwidth = width; + } + for (j = 0; j < values->counters; j++) { + width = strlen(values->countername[j]); + if (width > namewidth) + namewidth = width; + width = snprintf(NULL, 0, "%llx", values->counterrawid[j]); + if (width > rawwidth) + rawwidth = width; + } + for (i = 0; i < values->threads; i++) { + for (j = 0; j < values->counters; j++) { + width = 
snprintf(NULL, 0, "%Lu", values->value[i][j]); + if (width > countwidth) + countwidth = width; + } + } + + fprintf(fp, "# %*s %*s %*s %*s %*s\n", + pidwidth, "PID", tidwidth, "TID", + namewidth, "Name", rawwidth, "Raw", + countwidth, "Count"); + for (i = 0; i < values->threads; i++) + for (j = 0; j < values->counters; j++) + fprintf(fp, " %*d %*d %*s %*llx %*Lu\n", + pidwidth, values->pid[i], + tidwidth, values->tid[i], + namewidth, values->countername[j], + rawwidth, values->counterrawid[j], + countwidth, values->value[i][j]); +} + +void perf_read_values_display(FILE *fp, struct perf_read_values *values, + int raw) +{ + if (raw) + perf_read_values__display_raw(fp, values); + else + perf_read_values__display_pretty(fp, values); +} diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h index e41be5e86e6b..f8960fde0547 100644 --- a/tools/perf/util/values.h +++ b/tools/perf/util/values.h @@ -21,6 +21,7 @@ void perf_read_values_add_value(struct perf_read_values *values, u32 pid, u32 tid, u64 rawid, char *name, u64 value); -void perf_read_values_display(FILE *fp, struct perf_read_values *values); +void perf_read_values_display(FILE *fp, struct perf_read_values *values, + int raw); #endif /* _PERF_VALUES_H */ -- cgit v1.2.3 From 392741e0a4e17c82e3978b7fcbf04291294dc0a1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 7 Aug 2009 15:20:48 -0700 Subject: futex: Fix handling of bad requeue syscall pairing If futex_requeue(requeue_pi=1) finds a futex_q that was created by a call other the futex_wait_requeue_pi(), the q.rt_waiter may be null. If so, this will result in an oops from the following call graph: futex_requeue() rt_mutex_start_proxy_lock() task_blocks_on_rt_mutex() waiter->task dereference OOPS We currently WARN_ON() if this is detected, clearly this is inadequate. If we detect a mispairing in futex_requeue(), bail out, seding -EINVAL to user-space. V2: Fix parenthesis warnings. Signed-off-by: Darren Hart Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: John Kacur Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz LKML-Reference: <4A7CA8C0.7010809@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 8cc3ee1363a0..e18cfbdc7190 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1256,8 +1256,15 @@ retry_private: if (!match_futex(&this->key, &key1)) continue; - WARN_ON(!requeue_pi && this->rt_waiter); - WARN_ON(requeue_pi && !this->rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter)) { + ret = -EINVAL; + break; + } /* * Wake nr_wake waiters. For requeue_pi, if we acquired the -- cgit v1.2.3 From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001 From: Shunichi Fuji Date: Tue, 11 Aug 2009 03:34:40 +0900 Subject: x86: Add reboot quirk for every 5 series MacBook/Pro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5) too. It seems all unibody MacBook and MacBookPro require PCI reboot handling, i guess. 
Following model/machine ID list shows unibody MacBook/Pro have the 5 series of model number: http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html Signed-off-by: Shunichi Fuji Cc: Ozan Çağlayan LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9eb897603705..a06e8d101844 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) } static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { - { /* Handle problems with rebooting on Apple MacBook5,2 */ + { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, - .ident = "Apple MacBook", + .ident = "Apple MacBook5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, - { /* Handle problems with rebooting on Apple MacBookPro5,1 */ + { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, - .ident = "Apple MacBookPro5,1", + .ident = "Apple MacBookPro5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, { } -- cgit v1.2.3 From a46c3335551b9b053b2eea551ea7ba3719de03bb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Aug 2009 22:01:28 +0200 Subject: Revert "futex: Update woken requeued futex_q lock_ptr" This reverts commit 00235fe25eba6d3a13f3349b2e3a2d94b699a414. v2 being merged. --- kernel/futex.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 956e58f469a3..d077201b393d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1010,24 +1010,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * q: the futex_q * key: the key of the requeue target futex - * hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. Set the futex_q key * to the requeue target futex so the waiter can detect the wakeup on the right * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with the q->lock_ptr held. + * atomic lock acquisition. Must be called with the q->lock_ptr held. 
*/ static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, - struct futex_hash_bucket *hb) +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) { drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; - q->lock_ptr = &hb->lock; WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -1093,7 +1088,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); if (ret == 1) - requeue_pi_wake_futex(top_waiter, key2, hb2); + requeue_pi_wake_futex(top_waiter, key2); return ret; } @@ -1278,7 +1273,7 @@ retry_private: this->task, 1); if (ret == 1) { /* We got the lock. */ - requeue_pi_wake_futex(this, &key2, hb2); + requeue_pi_wake_futex(this, &key2); continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3 From 5b7e88edc6193f36941bccbfd5ed9ed5fe27d2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:40 +0800 Subject: x86, mce: Support specifying context for software mce injection The cpu context is specified via the new mce.inject_flags fields. This allows more realistic machine check testing in different situations. "RANDOM" context is implemented via NMI broadcasting to add randomization to testing. AK: Fix NMI broadcasting check. Fix 32-bit building. Some race fixes. Move to module. Various changes ChangeLog: v3: - Re-based on latest x86-tip.git/mce4 - Fix 32-bit building v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 11 ++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 156 ++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ad7535372918..8945be9ad2b1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,13 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +55,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f09..ad5d92790ebc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include #include #include +#include +#include +#include +#include #include +#include /* Update fake mce registers on current CPU. 
*/ static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; +static void raise_corrected(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->status & MCI_STATUS_UC) + raise_uncorrected(m, args->regs); + else if (m->status) + raise_corrected(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, }; /* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +static int raise_local(struct mce *m) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; + int context = MCJ_CTX(m->inject_flags); + int ret = 0; int cpu = m->extcpu; - inject_mce(m); if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; - memset(®s, 0, sizeof(struct pt_regs)); - regs.ip = m->ip; - regs.cs = m->cs; printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. 
+ */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_uncorrected(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_corrected(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(m); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(m); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +211,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } -- cgit v1.2.3 From 0dcc66851f1091af421416c28a9458836885f522 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:41 +0800 Subject: x86, mce: Support specifying raise mode for software MCE injection Raise mode include raising as exception or raising as poll, it is specified via the mce.inject_flags field. This can be used to specify raise mode of UCNA, which is UC error but raised not as exception. And this can be used to test the filter code of poll handler or exception handler too. For example, enforce a poll raise mode for a fatal MCE. ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8945be9ad2b1..b608a64c5814 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -44,6 +44,7 @@ #define MCJ_CTX_PROCESS 1 /* inject context: process */ #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ad5d92790ebc..7029f0e2acad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -44,7 +44,7 @@ static void inject_mce(struct mce *m) i->finished = 1; } -static void raise_corrected(struct mce *m) +static void raise_poll(struct mce *m) { unsigned long flags; mce_banks_t b; @@ -56,7 +56,7 @@ static void raise_corrected(struct mce *m) m->finished = 0; } -static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +static void raise_exception(struct mce *m, struct pt_regs *pregs) { struct pt_regs regs; unsigned long flags; @@ -85,10 +85,10 @@ static int mce_raise_notify(struct notifier_block *self, if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpu_clear(cpu, mce_inject_cpumask); - if (m->status & MCI_STATUS_UC) - raise_uncorrected(m, args->regs); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); else if (m->status) - raise_corrected(m); + raise_poll(m); return NOTIFY_STOP; } @@ -104,7 +104,7 @@ static int raise_local(struct mce *m) int ret = 0; int cpu = m->extcpu; - if (m->status & MCI_STATUS_UC) { + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: @@ -115,7 +115,7 @@ static int raise_local(struct mce *m) */ /*FALL THROUGH*/ case MCJ_CTX_PROCESS: - raise_uncorrected(m, NULL); + raise_exception(m, NULL); break; default: printk(KERN_INFO "Invalid MCE context\n"); @@ -124,7 +124,7 @@ static int raise_local(struct mce *m) printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - raise_corrected(m); + raise_poll(m); mce_notify_irq(); printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); } else -- cgit v1.2.3 From 5be9ed251f58881dfc3dd6742a81ff9ad1a7bb04 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:42 +0800 Subject: x86, mce: Move debugfs mce dir creating to mce.c Because more debugfs files under mce dir will be create in mce.c. ChangeLog: v5: - Rebased on x86-tip.git/mce Signed-off-by: Huang Ying Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6bd51e7ba87b..32996f9fab67 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -22,6 +22,7 @@ struct mce_bank { }; int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 51f7c725dab5..bc35a073d151 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -197,7 +197,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1ce6db1f8789..9c7419e459d6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2003,3 +2004,15 @@ static int __init mcheck_disable(char *str) return 1; } __setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} +#endif -- cgit v1.2.3
From bf783f9f7d33576815bc89f9f1856a7309ea2f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:43 +0800 Subject: x86, mce: Fake panic support for MCE testing If "fake panic" mode is turned on, just log the panic message instead of triggering a real panic. This is used for testing only, so that the test suite can check for the correct panic message and do regression testing of cases where an MCE would panic. This patch is based on x86-tip.git/mce. ChangeLog: v5: - Rebased on x86-tip.git/mce v4: - Move config file from sysfs to debugfs Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 75 ++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9c7419e459d6..54bd1b2fb4c0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,6 +204,9 @@ static void print_mce_tail(void) static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress.
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { @@ -221,15 +224,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -256,9 +265,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -2015,4 +2027,45 @@ struct dentry *mce_get_debugfs_dir(void) return dmce; } + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mce_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mce_debugfs_init); #endif -- cgit v1.2.3
From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 9 Aug 2009 21:44:49 -0700 Subject: x86, mce: therm_throt - change when we print messages My Latitude d630 seems to be handling thermal events in SMI by lowering the max frequency of the CPU till it cools down, but it still leaks the "everything is normal" events. This spams the console with high-priority printks. Adjust the therm_throt driver to only print messages about the fact that the temperature returned to normal when leaving the throttling state. Also lower the severity of the "back to normal" message from KERN_CRIT to KERN_INFO. Signed-off-by: Dmitry Torokhov Acked-by: H.
Peter Anvin LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..8bc64cfbe936 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -36,6 +36,7 @@ static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +static DEFINE_PER_CPU(bool, thermal_throttle_active); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -96,24 +97,27 @@ static int therm_throt_process(int curr) { unsigned int cpu = smp_processor_id(); __u64 tmp_jiffs = get_jiffies_64(); + bool was_throttled = __get_cpu_var(thermal_throttle_active); + bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; - if (curr) + if (is_throttled) __get_cpu_var(thermal_throttle_count)++; - if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (!(was_throttled ^ is_throttled) && + time_before64(tmp_jiffs, __get_cpu_var(next_check))) return 0; __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; /* if we just entered the thermal event */ - if (curr) { + if (is_throttled) { printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", cpu, - __get_cpu_var(thermal_throttle_count)); + "cpu clock throttled (total events = %lu)\n", + cpu, __get_cpu_var(thermal_throttle_count)); add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } else if (was_throttled) { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } return 1; -- cgit v1.2.3 From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:47:36 +0200 Subject: perf_counter, x86: Fix lapic printk message Instead of this garbled bootup on UP Pentium-M systems: [ 0.015048] Performance Counters: [ 0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only. Print: [ 0.015050] Performance Counters: [ 0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it. [ 0.017003] no PMU driver, software counters only. Cf: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f900954..40e233a24d9f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1590,7 +1590,7 @@ static int p6_pmu_init(void) } if (!cpu_has_apic) { - pr_info("no Local APIC, try rebooting with lapic"); + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); return -ENODEV; } -- cgit v1.2.3 From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:26:33 +0200 Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs Johannes Stezenbach reported that 'perf stat' does not count cache-miss and cache-references events on his Pentium-M based laptop. This is because we left them blank in p6_perfmon_event_map[], fill them in. 
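The two constants pack a PerfEvtSel event-select byte in bits 0-7 and a unit mask in bits 8-15; on P6, event 0x2e counts L2 requests and the MESI unit mask picks which line states are counted (0x0f = all states, i.e. references; 0x01 = invalid only, i.e. misses). A small standalone decoding sketch of that encoding (my reading of the P6 event tables, hedged accordingly):

    #include <stdio.h>

    #define EVTSEL(raw) ((raw) & 0xff)         /* event select, bits 0-7 */
    #define UMASK(raw)  (((raw) >> 8) & 0xff)  /* unit mask, bits 8-15 */

    int main(void)
    {
            unsigned refs = 0x0f2e, misses = 0x012e;

            printf("references: event %#x, umask %#x\n",
                   EVTSEL(refs), UMASK(refs));
            printf("misses:     event %#x, umask %#x\n",
                   EVTSEL(misses), UMASK(misses));
            return 0;
    }
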
Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 40e233a24d9f..fffc126dbdf0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0000, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0000, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -- cgit v1.2.3 From 83f19150242ee57730a4331b6ae9a9b90ccc718c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Aug 2009 10:40:08 +0200 Subject: perf_counter, x86: Fix/improve apic fallback Johannes Stezenbach reported that his Pentium-M based laptop does not have the local APIC enabled by default, and hence perfcounters do not get initialized. Add a fallback for this case: allow non-sampled counters and return with an error on sampled counters. This allows 'perf stat' to work out of box - and allows 'perf top' and 'perf record' to fall back on a hrtimer based sampling method. ( Passing 'lapic' on the boot line will allow hardware sampling to occur - but if the APIC is disabled permanently by the hardware then this fallback still allows more systems to use perfcounters. ) Also decouple perfcounter support from X86_LOCAL_APIC. Reported-by: Johannes Stezenbach Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8b..13ffa5df37d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,6 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_PERF_COUNTERS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -742,7 +743,6 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC - select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fffc126dbdf0..76441868ee27 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -55,6 +55,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + int apic; u64 max_period; u64 intel_ctrl; }; @@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) { +#ifdef X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) @@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void) if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } +#endif return true; +#ifdef X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); @@ -644,10 +648,12 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; +#endif } static void 
release_pmc_hardware(void) { +#ifdef X86_LOCAL_APIC int i; for (i = 0; i < x86_pmu.num_counters; i++) { @@ -657,6 +663,7 @@ static void release_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); +#endif } static void hw_perf_counter_destroy(struct perf_counter *counter) @@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * counters (user-space has to fall back and + * sample via a hrtimer based software counter): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; } counter->destroy = hw_perf_counter_destroy; @@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_counter_pending(void) { +#ifdef X86_LOCAL_APIC apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif } void perf_counters_lapic_init(void) { - if (!x86_pmu_initialized()) +#ifdef X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif } static int __kprobes @@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; +#ifdef X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif /* * Can't rely on the handled return value to say it was our NMI, two * counters could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = { .event_map = p6_pmu_event_map, .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, .num_counters = 2, @@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, }; @@ -1589,13 +1614,14 @@ static int p6_pmu_init(void) return -ENODEV; } + x86_pmu = p6_pmu; + if (!cpu_has_apic) { pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - return -ENODEV; + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; } - x86_pmu = p6_pmu; - return 0; } -- cgit v1.2.3 From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Mon, 10 Aug 2009 19:56:45 -0300 Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag Due to an erratum with certain AMD Athlon 64 processors, the BIOS may need to force enable the LAHF_LM capability. Unfortunately, in at least one case, the BIOS does this even for processors that do not support the functionality. Add a specific check that will clear the feature bit for processors known not to support the LAHF/SAHF instructions. 
Signed-off-by: Kevin Winchester Acked-by: Borislav Petkov LKML-Reference: <4A80A5AD.2000209@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2485b03f1cf..63fddcd082cd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + */ + if (c->x86_model < 0x14) + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); -- cgit v1.2.3
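The model number tested above (c->x86_model < 0x14) comes from the CPUID signature; for reference, a hedged user-space sketch of the usual decoding (leaf 1 EAX; the extended-model nibble applies for families 0x6 and 0xf, which covers K8):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned eax, ebx, ecx, edx, family, model;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;
            family = (eax >> 8) & 0xf;
            model = (eax >> 4) & 0xf;
            if (family == 0xf)
                    family += (eax >> 20) & 0xff;       /* extended family */
            if (family == 0x6 || family >= 0xf)
                    model |= ((eax >> 16) & 0xf) << 4;  /* extended model */
            printf("family %#x, model %#x\n", family, model);
            return 0;
    }
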