From b3a0cb456d848e10b2f7b371ba05e44f1384520a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 3 Jun 2008 17:30:20 -0700 Subject: x86: extend percpu ops to 64 bit * x86 percpu ops now will work on 64 bit too, so add the missing 8 byte cases. * Add a few atomic ops that will be useful in the future: x86_xchg_percpu() x86_cmpxchg_percpu(). x86_inc_percpu() - Increment by one can generate more efficient x86_dec_percpu() instructions and inc/dec will be supported by cpu ops later. * Use per_cpu_var() instead of per_cpu__##xxx. Signed-off-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-x86/percpu.h | 83 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h index 736fc3bb8e1e..d85e5aaca7b1 100644 --- a/include/asm-x86/percpu.h +++ b/include/asm-x86/percpu.h @@ -108,6 +108,11 @@ do { \ : "+m" (var) \ : "ri" ((T__)val)); \ break; \ + case 8: \ + asm(op "q %1,"__percpu_seg"%0" \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ default: __bad_percpu_size(); \ } \ } while (0) @@ -131,16 +136,84 @@ do { \ : "=r" (ret__) \ : "m" (var)); \ break; \ + case 8: \ + asm(op "q "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ default: __bad_percpu_size(); \ } \ ret__; \ }) -#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) -#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) -#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) -#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) -#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) +#define percpu_addr_op(op, var) \ +({ \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_seg"%0" \ + : : "m"(var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_seg"%0" \ + : : "m"(var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_seg"%0" \ + : : "m"(var)); \ + break; \ + case 8: \ + asm(op "q "__percpu_seg"%0" \ + : : "m"(var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ +}) + +#define percpu_cmpxchg_op(var, old, new) \ +({ \ + typeof(var) prev; \ + switch (sizeof(var)) { \ + case 1: \ + asm("cmpxchgb %b1, "__percpu_seg"%2" \ + : "=a"(prev) \ + : "q"(new), "m"(var), "0"(old) \ + : "memory"); \ + break; \ + case 2: \ + asm("cmpxchgw %w1, "__percpu_seg"%2" \ + : "=a"(prev) \ + : "r"(new), "m"(var), "0"(old) \ + : "memory"); \ + break; \ + case 4: \ + asm("cmpxchgl %k1, "__percpu_seg"%2" \ + : "=a"(prev) \ + : "r"(new), "m"(var), "0"(old) \ + : "memory"); \ + break; \ + case 8: \ + asm("cmpxchgq %1, "__percpu_seg"%2" \ + : "=a"(prev) \ + : "r"(new), "m"(var), "0"(old) \ + : "memory"); \ + break; \ + default: \ + __bad_percpu_size(); \ + } \ + return prev; \ +}) + +#define x86_read_percpu(var) percpu_from_op("mov", per_cpu_var(var)) +#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu_var(var), val) +#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu_var(var), val) +#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu_var(var), val) +#define x86_inc_percpu(var) percpu_addr_op("inc", per_cpu_var(var)) +#define x86_dec_percpu(var) percpu_addr_op("dec", per_cpu_var(var)) +#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu_var(var), val) +#define x86_xchg_percpu(var, val) percpu_to_op("xchg", per_cpu_var(var), val) +#define x86_cmpxchg_percpu(var, old, new) \ + percpu_cmpxchg_op(per_cpu_var(var), old, new) + #endif /* !__ASSEMBLY__ */ #endif /* !CONFIG_X86_64 */ #endif /* _ASM_X86_PERCPU_H_ */ -- cgit v1.2.3 From d3794979a8a80c222ce9d016a6dfc4bed36965d0 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 3 Jun 2008 17:30:19 -0700 Subject: Zero based percpu: infrastructure to rebase the per cpu area to zero * Support an option CONFIG_HAVE_ZERO_BASED_PER_CPU to make offsets for per cpu variables to start at zero. If a percpu area starts at zero then: - We do not need RELOC_HIDE anymore - Provides for the future capability of architectures providing a per cpu allocator that returns offsets instead of pointers. The offsets would be independent of the processor so that address calculations can be done in a processor independent way. Per cpu instructions can then add the processor specific offset at the last minute possibly in an atomic instruction. The data the linker provides is different for zero based percpu segments: __per_cpu_load -> The address at which the percpu area was loaded __per_cpu_size -> The length of the per cpu area * Removes the &__per_cpu_x in lockdep. The __per_cpu_x are already pointers. There is no need to take the address. * Updates kernel/module.c to be able to deal with a percpu area that is loaded at __per_cpu_load but is accessed at __per_cpu_start. Signed-off-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-generic/percpu.h | 9 ++++++++- include/asm-generic/sections.h | 10 ++++++++++ include/asm-generic/vmlinux.lds.h | 16 ++++++++++++++++ include/linux/percpu.h | 17 ++++++++++++++++- kernel/lockdep.c | 4 ++-- kernel/module.c | 7 ++++--- 6 files changed, 56 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index b0e63c672ebd..36d188f47c98 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -45,7 +45,12 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; * Only S390 provides its own means of moving the pointer. */ #ifndef SHIFT_PERCPU_PTR -#define SHIFT_PERCPU_PTR(__p, __offset) RELOC_HIDE((__p), (__offset)) +# ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU +# define SHIFT_PERCPU_PTR(__p, __offset) \ + ((__typeof(__p))(((void *)(__p)) + (__offset))) +# else +# define SHIFT_PERCPU_PTR(__p, __offset) RELOC_HIDE((__p), (__offset)) +# endif /* CONFIG_HAVE_ZERO_BASED_PER_CPU */ #endif /* @@ -70,6 +75,8 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define SHIFT_PERCPU_PTR(__p, __offset) (__p) +#define per_cpu_offset(x) 0L #endif /* SMP */ diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 8feeae1f2369..f697620c3e21 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -9,7 +9,17 @@ extern char __bss_start[], __bss_stop[]; extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char _end[]; +#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU +extern char __per_cpu_load[]; +extern char ____per_cpu_size[]; +#define __per_cpu_size ((unsigned long)&____per_cpu_size) +#define __per_cpu_start ((char *)0) +#define __per_cpu_end ((char *)__per_cpu_size) +#else extern char __per_cpu_start[], __per_cpu_end[]; +#define __per_cpu_load __per_cpu_start +#define __per_cpu_size (__per_cpu_end - __per_cpu_start) +#endif extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 69e5c1182fde..465a97ae5f17 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -344,6 +344,21 @@ *(.initcall7.init) \ *(.initcall7s.init) +#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU +#define PERCPU(align) \ + . = ALIGN(align); \ + percpu : { } :percpu \ + __per_cpu_load = .; \ + .data.percpu 0 : AT(__per_cpu_load - LOAD_OFFSET) { \ + *(.data.percpu.first) \ + *(.data.percpu.shared_aligned) \ + *(.data.percpu) \ + *(.data.percpu.page_aligned) \ + ____per_cpu_size = .; \ + } \ + . = __per_cpu_load + ____per_cpu_size; \ + data : { } :data +#else #define PERCPU(align) \ . = ALIGN(align); \ __per_cpu_start = .; \ @@ -353,3 +368,4 @@ *(.data.percpu.shared_aligned) \ } \ __per_cpu_end = .; +#endif diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2edacc8e6b8b..3d1a7f6cc738 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -27,7 +27,18 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ __attribute__((__section__(".data.percpu.page_aligned"))) \ PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name + +#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU +#define DEFINE_PER_CPU_FIRST(type, name) \ + __attribute__((__section__(".data.percpu.first"))) \ + PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name #else +#define DEFINE_PER_CPU_FIRST(type, name) \ + DEFINE_PER_CPU(type, name) +#endif + +#else /* !CONFIG_SMP */ + #define DEFINE_PER_CPU(type, name) \ PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name @@ -36,7 +47,11 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ DEFINE_PER_CPU(type, name) -#endif + +#define DEFINE_PER_CPU_FIRST(type, name) \ + DEFINE_PER_CPU(type, name) + +#endif /* !CONFIG_SMP */ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..965b9c7e8e86 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -609,8 +609,8 @@ static int static_obj(void *obj) * percpu var? */ for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + start = (unsigned long) __per_cpu_start + per_cpu_offset(i); + end = (unsigned long) __per_cpu_start + PERCPU_ENOUGH_ROOM + per_cpu_offset(i); if ((addr >= start) && (addr < end)) diff --git a/kernel/module.c b/kernel/module.c index f5e9491ef7ac..9c895e6df07c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -365,7 +366,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __per_cpu_start; + ptr = __per_cpu_load; for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { /* Extra for alignment requirement. */ extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; @@ -400,7 +401,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, static void percpu_modfree(void *freeme) { unsigned int i; - void *ptr = __per_cpu_start + block_size(pcpu_size[0]); + void *ptr = __per_cpu_load + block_size(pcpu_size[0]); /* First entry is core kernel percpu data. */ for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -451,7 +452,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); + pcpu_size[0] = -__per_cpu_size; /* Free room. */ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { -- cgit v1.2.3 From f8d90d947ad983d6f0788778b39eb3d32ce1320f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 3 Jul 2008 21:30:09 +0200 Subject: percpu: zero based percpu build error on s390 The patch: "Zero based percpu: infrastructure to rebase the per cpu area to zero" causes this build error on !CONFIG_SMP on linux-next / s390: CC arch/s390/kernel/asm-offsets.s In file included from include/asm/percpu.h:35, from include/linux/percpu.h:9, from include/linux/rcupdate.h:39, from include/linux/pid.h:4, from include/linux/sched.h:74, from arch/s390/kernel/asm-offsets.c:7: include/asm-generic/percpu.h:78:1: warning: "SHIFT_PERCPU_PTR" redefined In file included from include/linux/percpu.h:9, from include/linux/rcupdate.h:39, from include/linux/pid.h:4, from include/linux/sched.h:74, from arch/s390/kernel/asm-offsets.c:7: include/asm/percpu.h:25:1: warning: this is the location of the previous definition Fix below. Please merge with original patch. Signed-off-by: Heiko Carstens Cc: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-generic/percpu.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 36d188f47c98..1c02250353ee 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -75,7 +75,9 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) -#define SHIFT_PERCPU_PTR(__p, __offset) (__p) +#ifndef SHIFT_PERCPU_PTR +# define SHIFT_PERCPU_PTR(__p, __offset) (__p) +#endif #define per_cpu_offset(x) 0L #endif /* SMP */ -- cgit v1.2.3 From bd8fbdee6562ee526f3c2582a3b373ef195015dd Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Wed, 3 Sep 2008 17:53:07 +0200 Subject: lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set This patch fixes compilation if CONFIG_TRACE_IRQFLAGS_SUPPORT is ever disabled (which is currently not allowed by Kconfig). Alternatively we could just remove the option altogether and the associated code paths. Since the compilation error has been in the tree for at least two years and no one noticed it, I guess we don't really have the need for CONFIG_TRACE_IRQFLAGS_SUPPORT=n. Boot tested on x86 UP. Signed-off-by: Rui Sousa Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c9..f2993512b3b5 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - #include +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,21 +84,20 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -# define raw_local_irq_disable() local_irq_disable() -# define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) \ +#define local_irq_disable() raw_local_irq_disable() +#define local_irq_enable() raw_local_irq_enable() +#define local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ } while (0) -# define raw_local_irq_restore(flags) \ +# define local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -124,6 +123,5 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) -#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3 From cdad72207d164569cb4bf647eb824a7f93e8d388 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Sep 2008 16:00:57 +0200 Subject: lockstat: documentation update Christoph noted that the documentation doesn't tell in what unit the lockstat times are reported, ammend this. Reported-by: Christoph Hellwig Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/lockstat.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 4ba4664ce5c3..02f36f5c64fe 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -100,6 +100,7 @@ The first lock (05-10) is a read/write lock, and shows two lines above the short separator. The contention points don't match the column descriptors, they have two: contentions and [] symbol. +The integer part of the time values is in us. View the top contending locks: -- cgit v1.2.3 From 8c56250f48347750c82ab18d98d647dcf99ca674 Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Thu, 4 Sep 2008 19:47:53 +0200 Subject: lockdep, UML: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set Hiroshi Shimamoto reported: > > !TRACE_IRQFLAGS_SUPPORT mode of build for future work, it can be > > restored via a simple revert. > > Hi, it seems that this patch breaks uml build. > > kernel/printk.c: In function 'vprintk': > kernel/printk.c:674: error: implicit declaration of function > 'raw_local_irq_save' kernel/printk.c:772: error: implicit declaration of > function 'raw_local_irq_restore' With the patch bellow it compiles (make ARCH=um with a x86 host), but I'm really out of my league on this one... Reported-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- include/asm-um/system-generic.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/asm-um/system-generic.h b/include/asm-um/system-generic.h index 5bcfa35e7a22..f1ea4da34fad 100644 --- a/include/asm-um/system-generic.h +++ b/include/asm-um/system-generic.h @@ -4,15 +4,15 @@ #include "asm/arch/system.h" #undef switch_to -#undef local_irq_save -#undef local_irq_restore -#undef local_irq_disable -#undef local_irq_enable -#undef local_save_flags -#undef local_irq_restore -#undef local_irq_enable -#undef local_irq_disable -#undef local_irq_save +#undef raw_local_irq_save +#undef raw_local_irq_restore +#undef raw_local_irq_disable +#undef raw_local_irq_enable +#undef raw_local_save_flags +#undef raw_local_irq_restore +#undef raw_local_irq_enable +#undef raw_local_irq_disable +#undef raw_local_irq_save #undef irqs_disabled extern void *switch_to(void *prev, void *next, void *last); @@ -23,21 +23,21 @@ extern int get_signals(void); extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \ (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ +#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \ set_signals(flags); } while(0) -#define local_irq_save(flags) do { local_save_flags(flags); \ - local_irq_disable(); } while(0) +#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \ + raw_local_irq_disable(); } while(0) -#define local_irq_enable() unblock_signals() -#define local_irq_disable() block_signals() +#define raw_local_irq_enable() unblock_signals() +#define raw_local_irq_disable() block_signals() #define irqs_disabled() \ ({ \ unsigned long flags; \ - local_save_flags(flags); \ + raw_local_save_flags(flags); \ (flags == 0); \ }) -- cgit v1.2.3 From 1b2439dbb703ae8d95a9ce7ece6b7800b80f41f0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging during some development we suspected a case where we left something in a notifier chain that was from a module that was unloaded already... and that sort of thing is rather hard to track down. This patch adds a very simple sanity check (which isn't all that expensive) to make sure the notifier we're about to call is actually from either the kernel itself of from a still-loaded module, avoiding a hard-to-chase-down crash. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/notifier.c | 16 ++++++++++++++++ lib/Kconfig.debug | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11584ef..143fdd77dbf7 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,6 +21,10 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -34,6 +38,10 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -82,6 +90,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); + +#ifdef CONFIG_DEBUG_NOTIFIERS + if (!kernel_text_address((unsigned long)nb->notifier_call)) { + WARN(1, "Invalid notifier called!"); + nb = next_nb; + continue; + } +#endif ret = nb->notifier_call(nb, val, v); if (nr_calls) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8b5a7d304a5f..342858fbabbc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -536,6 +536,16 @@ config DEBUG_SG If unsure, say N. +config DEBUG_NOTIFIERS + bool "Debug notifier call chains" + depends on DEBUG_KERNEL + help + Enable this to turn on sanity checking for notifier call chains. + This is most useful for kernel developers to make sure that + modules properly unregister themselves from notifier chains. + This is a relatively cheap check but if you care about maximum + performance, say N. + config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ -- cgit v1.2.3 From fb822db465bd9fd4208eef1af4490539b236c54e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 11:17:40 +0200 Subject: softlockup: increase hung tasks check from 2 minutes to 8 minutes Andrew says: > Seems that about 100% of the reports we get of this warning triggering > are sys_sync, transaction commit, etc. increase the timeout. If it still triggers for people, we can kill it. Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492fbfcf..17a058065331 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; unsigned long __read_mostly sysctl_hung_task_warnings = 10; -- cgit v1.2.3 From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging, v2 - unbreak ia64 (and powerpc) where function pointers dont point at code but at data (reported by Tony Luck) [ mingo@elte.hu: various cleanups ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 +++ kernel/extable.c | 16 ++++++++++++++++ kernel/notifier.c | 10 +--------- lib/vsprintf.c | 2 +- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6d..4e1366b552ae 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,6 +187,9 @@ extern unsigned long long memparse(char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); +extern int func_ptr_is_kernel_text(void *ptr); +extern void *dereference_function_descriptor(void *ptr); + struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e17023..adf0cc9c02d6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr) return 1; return module_text_address(addr) != NULL; } + +/* + * On some architectures (PPC64, IA64) function pointers + * are actually only tokens to some data that then holds the + * real function address. As a result, to find if a function + * pointer is part of the kernel text, we need to do some + * special dereferencing first. + */ +int func_ptr_is_kernel_text(void *ptr) +{ + unsigned long addr; + addr = (unsigned long) dereference_function_descriptor(ptr); + if (core_kernel_text(addr)) + return 1; + return module_text_address(addr) != NULL; +} diff --git a/kernel/notifier.c b/kernel/notifier.c index 143fdd77dbf7..0f39e398ef60 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, next_nb = rcu_dereference(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS - if (!kernel_text_address((unsigned long)nb->notifier_call)) { + if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d8d1d1142248..f5e5ffb9942f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -513,7 +513,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static inline void *dereference_function_descriptor(void *ptr) +void *dereference_function_descriptor(void *ptr) { #if defined(CONFIG_IA64) || defined(CONFIG_PPC64) void *p; -- cgit v1.2.3 From 76b189e91845eab3a9d52bb97f971d312d25652d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Sep 2008 09:57:35 +0200 Subject: lockdep: add might_lock() / might_lock_read() useful to establish a lock dependency in case the actual dependency is rare or hard to trigger. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 331e5f1c2d8e..0aa657aa8a1e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -480,4 +480,22 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define lock_map_release(l) do { } while (0) #endif +#ifdef CONFIG_PROVE_LOCKING +# define might_lock(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +# define might_lock_read(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +#else +# define might_lock(lock) do { } while (0) +# define might_lock_read(lock) do { } while (0) +#endif + #endif /* __LINUX_LOCKDEP_H */ -- cgit v1.2.3 From c10d38dda1774ed4540380333cabd229eff37094 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths copy_to/from_user and all its variants (except the atomic ones) can take a page fault and perform non-trivial work like taking mmap_sem and entering the filesyste/pagecache. Unfortunately, this often escapes lockdep because a common pattern is to use it to read in some arguments just set up from userspace, or write data back to a hot buffer. In those cases, it will be unlikely for page reclaim to get a window in to cause copy_*_user to fault. With the new might_lock primitives, add some annotations to x86. I don't know if I caught all possible faulting points (it's a bit of a maze, and I didn't really look at 32-bit). But this is a starting point. Boots and runs OK so far. Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 7 ++++++- arch/x86/lib/usercopy_64.c | 4 ++++ include/asm-x86/uaccess.h | 14 ++++++++++++++ include/asm-x86/uaccess_32.h | 10 ++++++++-- include/asm-x86/uaccess_64.h | 12 ++++++++++++ 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971a..8eedde2a9cac 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -33,6 +33,8 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon do { \ int __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -120,6 +122,8 @@ EXPORT_SYMBOL(strncpy_from_user); do { \ int __d0; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -148,7 +152,6 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -191,6 +194,8 @@ long strnlen_user(const char __user *s, long n) unsigned long res, tmp; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718b..847d12945998 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -16,6 +16,8 @@ do { \ long __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -65,6 +67,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 5f702d1d5218..ad29752a1713 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -157,6 +159,9 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -241,6 +246,9 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -265,6 +273,9 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -317,6 +328,9 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index 6fdef39a0bcb..d725e2d703f7 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,8 +82,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - return __copy_to_user_inatomic(to, from, n); + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); + return __copy_to_user_inatomic(to, from, n); } static __always_inline unsigned long @@ -138,6 +140,8 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; @@ -160,6 +164,8 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 515d4dce96b5..40a7205fe576 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -28,6 +28,10 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -70,6 +74,10 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -112,6 +120,10 @@ static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); -- cgit v1.2.3 From 3ee1afa308f2a38e5d1e2ad3752ad7abcf480da1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths, v2 - introduce might_fault() - handle the atomic user copy paths correctly [ mingo@elte.hu: move might_sleep() outside of in_atomic(). ] Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 12 +++--------- arch/x86/lib/usercopy_64.c | 8 ++------ include/asm-x86/uaccess.h | 18 ++++-------------- include/asm-x86/uaccess_32.h | 12 +++--------- include/asm-x86/uaccess_64.h | 12 +++--------- include/linux/kernel.h | 9 +++++++++ mm/memory.c | 15 +++++++++++++++ 7 files changed, 39 insertions(+), 47 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 8eedde2a9cac..7393152a252e 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -32,9 +32,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define __do_strncpy_from_user(dst, src, count, res) \ do { \ int __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -121,9 +119,7 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -193,9 +189,7 @@ long strnlen_user(const char __user *s, long n) unsigned long mask = -__addr_ok(s); unsigned long res, tmp; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 847d12945998..64d6c84e6353 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,9 +15,7 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ long __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -66,9 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user); unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index ad29752a1713..39f8420c75d9 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,8 +8,6 @@ #include #include #include -#include -#include #include #include @@ -159,9 +157,7 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -246,9 +242,7 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -273,9 +267,7 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -328,9 +320,7 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index d725e2d703f7..d10e842ec3ee 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,9 +82,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); return __copy_to_user_inatomic(to, from, n); } @@ -139,9 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; @@ -163,9 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 40a7205fe576..13fd56fbc3ab 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -29,9 +29,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -75,9 +73,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -121,9 +117,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6d..e580ec095765 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -140,6 +140,15 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void); +#else +static inline void might_fault(void) +{ + might_sleep(); +} +#endif + extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) diff --git a/mm/memory.c b/mm/memory.c index 1002f473f497..b8fdf4e5e65b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3016,3 +3016,18 @@ void print_vma_addr(char *prefix, unsigned long ip) } up_read(¤t->mm->mmap_sem); } + +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void) +{ + might_sleep(); + /* + * it would be nicer only to annotate paths which are not under + * pagefault_disable, however that requires a larger audit and + * providing helpers like get_user_atomic. + */ + if (!in_atomic() && current->mm) + might_lock_read(¤t->mm->mmap_sem); +} +EXPORT_SYMBOL(might_fault); +#endif -- cgit v1.2.3 From 1d18ef489509314506328b9e464dd47c24c1d68f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Sep 2008 20:53:21 +0200 Subject: x86: some lock annotations for user copy paths, v3 - add annotation back to clear_user() - change probe_kernel_address() to _inatomic*() method Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 1 + include/asm-x86/uaccess.h | 2 -- include/linux/uaccess.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7393152a252e..fab5faba1d3e 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -148,6 +148,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 39f8420c75d9..dc8edb5c4659 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -267,7 +267,6 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -320,7 +319,6 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index fec6decfb983..2062293e57e6 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ + ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3 From 53b9d87f41a3d8838210ad7cdef02d814817ce85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Sep 2008 17:02:58 -0700 Subject: lock debug: sit tight when we are already in a panic in: > http://bugzilla.kernel.org/show_bug.cgi?id=11543 The panic code called the kexec code which called mutex_trylock() which called spin_lock_mutex() which then stupidly went and blurted a load of debug stuff because of in_interrupt(). Keep the lock debug code from escallating an already crappy situation. Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 4aaa4afb1cb9..096476f1fb35 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -17,7 +17,7 @@ extern int debug_locks_off(void); ({ \ int __ret = 0; \ \ - if (unlikely(c)) { \ + if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ WARN_ON(1); \ __ret = 1; \ -- cgit v1.2.3 From 30742d5c2277c325fb0e9d2d817d55a19995fe8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:43:39 +0200 Subject: Revert "lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set" This reverts commit bd8fbdee6562ee526f3c2582a3b373ef195015dd. This broke the powerpc build - more fixes are needed before we can undo this revert. --- include/linux/irqflags.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index f2993512b3b5..74bde13224c9 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#include - #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +#include + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,20 +84,21 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -#define local_irq_disable() raw_local_irq_disable() -#define local_irq_enable() raw_local_irq_enable() -#define local_irq_save(flags) \ +# define raw_local_irq_disable() local_irq_disable() +# define raw_local_irq_enable() local_irq_enable() +# define raw_local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_save(flags); \ + local_irq_save(flags); \ } while (0) -# define local_irq_restore(flags) \ +# define raw_local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_restore(flags); \ + local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -123,5 +124,6 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) +#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3 From fb71e45338453698bd7460f7e8f171ea0304d218 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 15 Sep 2008 18:04:26 -0700 Subject: uaccess: fix parameters inversion for __copy_from_user_inatomic() The following patch changes to use __copy_from_user_inatomic(), but the passing parameters incorrect: x86: some lock annotations for user copy paths, v3 This fixes the netfilter crash reported by Steven Noonan. Reported-by: Steven Noonan Signed-off-by: Hiroshi Shimamoto Tested-by: Steven Noonan Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2062293e57e6..6b58367d145e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ + ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3 From 6918bc5c830e890681eabb3c6cb6b8d117a52d14 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:41 +0200 Subject: lockstat: fixup signed division Some recent modification to this code made me notice the little todo mark. Now that we have more elaborate 64-bit division functions this isn't hard. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 20dbcbf9c7dd..8d3a6eba8d5a 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) static void snprint_time(char *buf, size_t bufsiz, s64 nr) { - unsigned long rem; + s64 div; + s32 rem; nr += 5; /* for display rounding */ - rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.2.3 From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:20 +0200 Subject: futex: rely on get_user_pages() for shared futexes On the way of getting rid of the mmap_sem requirement for shared futexes, start by relying on get_user_pages(). Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/futex.h | 2 + kernel/futex.c | 162 +++++++++++++++++++++++++------------------------- 2 files changed, 82 insertions(+), 82 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index 586ab56a3ec3..8f627b9ae2b1 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -164,6 +164,8 @@ union futex_key { } both; }; +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } + #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e97c14..a4c39fa0a7a3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -161,6 +161,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + */ +static void get_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + atomic_inc(&key->shared.inode->i_count); + break; + case FUT_OFF_MMSHARED: + atomic_inc(&key->private.mm->mm_count); + break; + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static void drop_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + iput(key->shared.inode); + break; + case FUT_OFF_MMSHARED: + mmdrop(key->private.mm); + break; + } +} + /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex @@ -184,7 +223,6 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; struct page *page; int err; @@ -210,98 +248,47 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, key->private.address = address; return 0; } - /* - * The futex is hashed differently depending on whether - * it's in a shared or private mapping. So check vma first. - */ - vma = find_extend_vma(mm, address); - if (unlikely(!vma)) - return -EFAULT; - /* - * Permissions. - */ - if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) - return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; +again: + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + if (err < 0) + return err; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + put_page(page); + goto again; + } /* * Private mappings are handled in a simple way. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. Therefore we use - * VM_MAYSHARE here, not VM_SHARED which is restricted to shared - * mappings of _writable_ handles. + * the object not the particular process. */ - if (likely(!(vma->vm_flags & VM_MAYSHARE))) { - key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ + if (PageAnon(page)) { + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; - return 0; - } - - /* - * Linear file mappings are also simple. - */ - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - + vma->vm_pgoff); - return 0; + } else { + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.inode = page->mapping->host; + key->shared.pgoff = page->index; } - /* - * We could walk the page table to read the non-linear - * pte, and get the page index without fetching the page - * from swap. But that's a lot of code to duplicate here - * for a rare case, so we simply fetch the page. - */ - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); - return 0; - } - return err; -} + get_futex_key_refs(key); -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (key->both.ptr == NULL) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } + unlock_page(page); + put_page(page); + return 0; } -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. - */ -static void drop_futex_key_refs(union futex_key *key) +static inline +void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) { - if (!key->both.ptr) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } + drop_futex_key_refs(key); } static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) @@ -385,6 +372,7 @@ static int refill_pi_state_cache(void) /* pi_mutex gets initialized later */ pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -462,7 +450,7 @@ void exit_pi_state_list(struct task_struct *curr) struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; if (!futex_cmpxchg_enabled) return; @@ -725,7 +713,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, struct futex_hash_bucket *hb; struct futex_q *this, *next; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret; if (!bitset) @@ -760,6 +748,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; } @@ -773,7 +762,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; @@ -873,6 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; @@ -886,7 +877,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; @@ -974,6 +965,8 @@ out_unlock: drop_futex_key_refs(&key1); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; } @@ -1220,6 +1213,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1360,6 +1354,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); return ret; } @@ -1411,6 +1406,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1625,6 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1671,7 +1668,7 @@ static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) struct futex_q *this, *next; u32 uval; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret, attempt = 0; retry: @@ -1744,6 +1741,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; -- cgit v1.2.3 From 61270708ecf1cda148e84fbf6e0703ee5aa81814 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:21 +0200 Subject: futex: reduce mmap_sem usage now that we rely on get_user_pages() for the shared key handling move all the mmap_sem stuff closely around the slow paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 83 +++------------------------------------------------------- 1 file changed, 4 insertions(+), 79 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index a4c39fa0a7a3..6a726684217e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -122,24 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared - */ -static inline void futex_lock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - down_read(fshared); -} - -/* - * Release mm->mmap_sem, when the futex is shared - */ -static inline void futex_unlock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - up_read(fshared); -} - /* * We hash on the keys returned from get_futex_key (see below). */ @@ -250,7 +232,9 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: + down_read(&mm->mmap_sem); err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + up_read(&mm->mmap_sem); if (err < 0) return err; @@ -327,8 +311,7 @@ static int futex_handle_fault(unsigned long address, if (attempt > 2) return ret; - if (!fshared) - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { @@ -348,8 +331,7 @@ static int futex_handle_fault(unsigned long address, current->min_flt++; } } - if (!fshared) - up_read(&mm->mmap_sem); + up_read(&mm->mmap_sem); return ret; } @@ -719,8 +701,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, if (!bitset) return -EINVAL; - futex_lock_mm(fshared); - ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -749,7 +729,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; } @@ -769,8 +748,6 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, op_ret, attempt = 0; retryfull: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -821,12 +798,6 @@ retry: goto retry; } - /* - * If we would have faulted, release mmap_sem, - * fault it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(dummy, uaddr2); if (ret) return ret; @@ -864,7 +835,6 @@ retry: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -884,8 +854,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, drop_count = 0; retry: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -908,12 +876,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, if (hb1 != hb2) spin_unlock(&hb2->lock); - /* - * If we would have faulted, release mmap_sem, fault - * it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(curval, uaddr1); if (!ret) @@ -967,7 +929,6 @@ out_unlock: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -1211,8 +1172,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; q.bitset = bitset; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1245,12 +1204,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret)) { queue_unlock(&q, hb); - /* - * If we would have faulted, release mmap_sem, fault it in and - * start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret) @@ -1264,12 +1217,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - /* * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it @@ -1355,7 +1302,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); return ret; } @@ -1404,8 +1350,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1495,7 +1439,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1527,12 +1470,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - WARN_ON(!q.pi_state); /* * Block on the PI mutex: @@ -1545,7 +1482,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 0 : -EWOULDBLOCK; } - futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1611,7 +1547,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1622,7 +1557,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); return ret; @@ -1646,8 +1580,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@ -1679,10 +1611,6 @@ retry: */ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - /* - * First take all the futex related locks: - */ - futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1742,7 +1670,6 @@ out_unlock: spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; @@ -1766,8 +1693,6 @@ pi_faulted: goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; -- cgit v1.2.3 From 734b05b10e51d4ba38c8fc3ee02e846aab09eedf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:22 +0200 Subject: futex: use fast_gup() Change the get_user_pages() call with fast_gup() which doesn't require holding the mmap_sem thereby removing the mmap_sem from all fast paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6a726684217e..facf17d1a705 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -232,9 +232,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: - down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - up_read(&mm->mmap_sem); + err = get_user_pages_fast(address, 1, 0, &page); if (err < 0) return err; -- cgit v1.2.3 From c2f9f20154bfb137ccdf8c9159992429a40dfe20 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:23 +0200 Subject: futex: cleanup fshared fshared doesn't need to be a rw_sem pointer anymore, so clean that up. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index facf17d1a705..60b47bb9e3dd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -200,8 +200,7 @@ static void drop_futex_key_refs(union futex_key *key) * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. */ -static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) +static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -268,7 +267,7 @@ again: } static inline -void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) +void put_futex_key(int fshared, union futex_key *key) { drop_futex_key_refs(key); } @@ -297,10 +296,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * Fault handling. - * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, - struct rw_semaphore *fshared, int attempt) +static int futex_handle_fault(unsigned long address, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@ -687,8 +684,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake, u32 bitset) +static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -735,8 +731,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -790,7 +785,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + attempt); if (ret) goto out; goto retry; @@ -841,8 +836,7 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -1048,8 +1042,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, - struct rw_semaphore *fshared) + struct task_struct *newowner, int fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1128,7 +1121,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + ret = futex_handle_fault((unsigned long)uaddr, attempt++); spin_lock(q->lock_ptr); @@ -1152,7 +1145,7 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; @@ -1307,13 +1300,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - struct rw_semaphore *fshared = NULL; + int fshared = 0; ktime_t t; t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) - fshared = ¤t->mm->mmap_sem; + fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } @@ -1325,7 +1318,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -1571,8 +1564,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out_release_sem; goto retry_unlocked; @@ -1592,7 +1584,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) +static int futex_unlock_pi(u32 __user *uaddr, int fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -1683,8 +1675,7 @@ pi_faulted: spin_unlock(&hb->lock); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out; uval = 0; @@ -1816,8 +1807,7 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1, - FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -1913,10 +1903,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; - struct rw_semaphore *fshared = NULL; + int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = ¤t->mm->mmap_sem; + fshared = 1; switch (cmd) { case FUTEX_WAIT: -- cgit v1.2.3 From 42569c39917a08e8de1e8b5685463be7b74baebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2008 12:33:07 +0200 Subject: futex: fixup get_futex_key() for private futexes With the get_user_pages_fast() patches we made get_futex_key() obtain a reference on the returned key, but failed to do so for private futexes. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/futex.c b/kernel/futex.c index 60b47bb9e3dd..62cbd648e28a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -227,6 +227,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) return -EFAULT; key->private.mm = mm; key->private.address = address; + get_futex_key_refs(key); return 0; } -- cgit v1.2.3 From 7317d7b87edb41a9135e30be1ec3f7ef817c53dd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 30 Sep 2008 20:50:27 +1000 Subject: sched: improve preempt debugging This patch helped me out with a problem I recently had.... Basically, when the kernel lock is held, then preempt_count underflow does not get detected until it is released which may be a long time (and arbitrarily, eg at different points it may be rescheduled). If the bkl is released at schedule, the resulting output is actually fairly cryptic... With any other lock that elevates preempt_count, it is illegal to schedule under it (which would get found pretty quickly). bkl allows scheduling with preempt_count elevated, which makes underflows hard to debug. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375b..ec3bd1f398b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4305,7 +4305,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) return; /* * Is the spinlock portion underflowing? -- cgit v1.2.3 From c7e78cff6b7518212247fb20b1dc6411540dc9af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Oct 2008 23:17:09 +0200 Subject: lockstat: contend with points We currently only provide points that have to wait on contention, also lists the points we have to wait for. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/lockstat.txt | 50 +++++++++++++++++++++++++++++----------------- include/linux/lockdep.h | 13 ++++++++---- kernel/lockdep.c | 33 +++++++++++++++++++----------- kernel/lockdep_proc.c | 21 +++++++++++++++++-- kernel/mutex.c | 2 +- 5 files changed, 82 insertions(+), 37 deletions(-) diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 02f36f5c64fe..9cb9138f7a79 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -71,34 +71,48 @@ Look at the current lock statistics: # less /proc/lock_stat -01 lock_stat version 0.2 +01 lock_stat version 0.3 02 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 03 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total 04 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 05 -06 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60 -07 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38 -08 -------------------------- -09 &inode->i_data.tree_lock 0 [] add_to_page_cache+0x5f/0x190 -10 -11 ............................................................................................................................................................................................... -12 -13 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 -14 ----------- -15 dcache_lock 180 [] sys_getcwd+0x11e/0x230 -16 dcache_lock 165 [] d_alloc+0x15a/0x210 -17 dcache_lock 33 [] _atomic_dec_and_lock+0x4d/0x70 -18 dcache_lock 1 [] shrink_dcache_parent+0x18/0x130 +06 &mm->mmap_sem-W: 233 538 18446744073708 22924.27 607243.51 1342 45806 1.71 8595.89 1180582.34 +07 &mm->mmap_sem-R: 205 587 18446744073708 28403.36 731975.00 1940 412426 0.58 187825.45 6307502.88 +08 --------------- +09 &mm->mmap_sem 487 [] do_page_fault+0x466/0x928 +10 &mm->mmap_sem 179 [] sys_mprotect+0xcd/0x21d +11 &mm->mmap_sem 279 [] sys_mmap+0x75/0xce +12 &mm->mmap_sem 76 [] sys_munmap+0x32/0x59 +13 --------------- +14 &mm->mmap_sem 270 [] sys_mmap+0x75/0xce +15 &mm->mmap_sem 431 [] do_page_fault+0x466/0x928 +16 &mm->mmap_sem 138 [] sys_munmap+0x32/0x59 +17 &mm->mmap_sem 145 [] sys_mprotect+0xcd/0x21d +18 +19 ............................................................................................................................................................................................... +20 +21 dcache_lock: 621 623 0.52 118.26 1053.02 6745 91930 0.29 316.29 118423.41 +22 ----------- +23 dcache_lock 179 [] _atomic_dec_and_lock+0x34/0x54 +24 dcache_lock 113 [] d_alloc+0x19a/0x1eb +25 dcache_lock 99 [] d_rehash+0x1b/0x44 +26 dcache_lock 104 [] d_instantiate+0x36/0x8a +27 ----------- +28 dcache_lock 192 [] _atomic_dec_and_lock+0x34/0x54 +29 dcache_lock 98 [] d_rehash+0x1b/0x44 +30 dcache_lock 72 [] d_alloc+0x19a/0x1eb +31 dcache_lock 112 [] d_instantiate+0x36/0x8a This excerpt shows the first two lock class statistics. Line 01 shows the output version - each time the format changes this will be updated. Line 02-04 -show the header with column descriptions. Lines 05-10 and 13-18 show the actual +show the header with column descriptions. Lines 05-18 and 20-31 show the actual statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 14) from the contention points. +short separator (line 08, 13) from the contention points. -The first lock (05-10) is a read/write lock, and shows two lines above the +The first lock (05-18) is a read/write lock, and shows two lines above the short separator. The contention points don't match the column descriptors, -they have two: contentions and [] symbol. +they have two: contentions and [] symbol. The second set of contention +points are the points we're contending with. The integer part of the time values is in us. diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0aa657aa8a1e..fc9f8e88123b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -73,6 +73,8 @@ struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; +#define LOCKSTAT_POINTS 4 + /* * The lock-class itself: */ @@ -119,7 +121,8 @@ struct lock_class { int name_version; #ifdef CONFIG_LOCK_STAT - unsigned long contention_point[4]; + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; #endif }; @@ -144,6 +147,7 @@ enum bounce_type { struct lock_class_stats { unsigned long contention_point[4]; + unsigned long contending_point[4]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; @@ -165,6 +169,7 @@ struct lockdep_map { const char *name; #ifdef CONFIG_LOCK_STAT int cpu; + unsigned long ip; #endif }; @@ -355,7 +360,7 @@ struct lock_class_key { }; #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); -extern void lock_acquired(struct lockdep_map *lock); +extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ @@ -363,13 +368,13 @@ do { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ - lock_acquired(&(_lock)->dep_map); \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) -#define lock_acquired(lockdep_map) do {} while (0) +#define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dbda475b13bd..234a9dccb4be 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -136,16 +136,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); -static int lock_contention_point(struct lock_class *class, unsigned long ip) +static int lock_point(unsigned long points[], unsigned long ip) { int i; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { - if (class->contention_point[i] == 0) { - class->contention_point[i] = ip; + for (i = 0; i < LOCKSTAT_POINTS; i++) { + if (points[i] == 0) { + points[i] = ip; break; } - if (class->contention_point[i] == ip) + if (points[i] == ip) break; } @@ -185,6 +185,9 @@ struct lock_class_stats lock_stats(struct lock_class *class) for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) + stats.contending_point[i] += pcs->contending_point[i]; + lock_time_add(&pcs->read_waittime, &stats.read_waittime); lock_time_add(&pcs->write_waittime, &stats.write_waittime); @@ -209,6 +212,7 @@ void clear_lock_stats(struct lock_class *class) memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } memset(class->contention_point, 0, sizeof(class->contention_point)); + memset(class->contending_point, 0, sizeof(class->contending_point)); } static struct lock_class_stats *get_lock_stats(struct lock_class *class) @@ -3001,7 +3005,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - int i, point; + int i, contention_point, contending_point; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -3025,18 +3029,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock_class(hlock), ip); + contention_point = lock_point(hlock_class(hlock)->contention_point, ip); + contending_point = lock_point(hlock_class(hlock)->contending_point, + lock->ip); stats = get_lock_stats(hlock_class(hlock)); - if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[point]++; + if (contention_point < LOCKSTAT_POINTS) + stats->contention_point[contention_point]++; + if (contending_point < LOCKSTAT_POINTS) + stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } static void -__lock_acquired(struct lockdep_map *lock) +__lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@ -3085,6 +3093,7 @@ found_it: put_lock_stats(stats); lock->cpu = cpu; + lock->ip = ip; } void lock_contended(struct lockdep_map *lock, unsigned long ip) @@ -3106,7 +3115,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -void lock_acquired(struct lockdep_map *lock) +void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@ -3119,7 +3128,7 @@ void lock_acquired(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_acquired(lock); + __lock_acquired(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8d3a6eba8d5a..13716b813896 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -557,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (stats->read_holdtime.nr) namelen += 2; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + for (i = 0; i < LOCKSTAT_POINTS; i++) { char sym[KSYM_SYMBOL_LEN]; char ip[32]; @@ -574,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) stats->contention_point[i], ip, sym); } + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contending_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contending_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contending_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contending_point[i], + ip, sym); + } if (i) { seq_puts(m, "\n"); seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); @@ -583,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.2\n"); + seq_printf(m, "lock_stat version 0.3\n"); seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", diff --git a/kernel/mutex.c b/kernel/mutex.c index 12c779dc65d4..39a3816b68d9 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } done: - lock_acquired(&lock->dep_map); + lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); -- cgit v1.2.3 From 5c653fd2257bb736e8bd0ff8acf6dac6081744e2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Oct 2008 17:50:40 -0700 Subject: xen: don't reload cr3 on suspend It isn't necessary, and it makes the code needlessly non-portable. Signed-off-by: Jeremy Fitzhardinge Cc: Isaku Yamahata Signed-off-by: Ingo Molnar --- drivers/xen/manage.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index d0e87cbe157c..9b91617b9582 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -39,8 +39,6 @@ static int xen_suspend(void *data) BUG_ON(!irqs_disabled()); - load_cr3(swapper_pg_dir); - err = device_power_down(PMSG_SUSPEND); if (err) { printk(KERN_ERR "xen_suspend: device_power_down failed: %d\n", -- cgit v1.2.3 From 85192bdb6221a5b8ba457f6e3ce4d9729220efb0 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 14 Oct 2008 17:50:41 -0700 Subject: xen: portability clean up and some minor clean up for xencomm.c clean up of xencomm.c. is_phys_contiguous() is arch dependent function that depends on how virtual memory are laid out. So split out the function into arch specific code. Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Cc: Isaku Yamahata Signed-off-by: Ingo Molnar Cc: "Luck, Tony" --- drivers/xen/xencomm.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c index 797cb4e31f07..a240b2c20b99 100644 --- a/drivers/xen/xencomm.c +++ b/drivers/xen/xencomm.c @@ -23,13 +23,7 @@ #include #include #include -#ifdef __ia64__ -#include /* for is_kern_addr() */ -#endif - -#ifdef HAVE_XEN_PLATFORM_COMPAT_H -#include -#endif +#include /* for xencomm_is_phys_contiguous() */ static int xencomm_init(struct xencomm_desc *desc, void *buffer, unsigned long bytes) @@ -157,20 +151,11 @@ static int xencomm_create(void *buffer, unsigned long bytes, return 0; } -/* check if memory address is within VMALLOC region */ -static int is_phys_contiguous(unsigned long addr) -{ - if (!is_kernel_addr(addr)) - return 0; - - return (addr < VMALLOC_START) || (addr >= VMALLOC_END); -} - static struct xencomm_handle *xencomm_create_inline(void *ptr) { unsigned long paddr; - BUG_ON(!is_phys_contiguous((unsigned long)ptr)); + BUG_ON(!xencomm_is_phys_contiguous((unsigned long)ptr)); paddr = (unsigned long)xencomm_pa(ptr); BUG_ON(paddr & XENCOMM_INLINE_FLAG); @@ -202,7 +187,7 @@ struct xencomm_handle *xencomm_map(void *ptr, unsigned long bytes) int rc; struct xencomm_desc *desc; - if (is_phys_contiguous((unsigned long)ptr)) + if (xencomm_is_phys_contiguous((unsigned long)ptr)) return xencomm_create_inline(ptr); rc = xencomm_create(ptr, bytes, &desc, GFP_KERNEL); @@ -219,7 +204,7 @@ struct xencomm_handle *__xencomm_map_no_alloc(void *ptr, unsigned long bytes, int rc; struct xencomm_desc *desc = NULL; - if (is_phys_contiguous((unsigned long)ptr)) + if (xencomm_is_phys_contiguous((unsigned long)ptr)) return xencomm_create_inline(ptr); rc = xencomm_create_mini(ptr, bytes, xc_desc, -- cgit v1.2.3 From a5af4eb16618dba52321fe420e2c93e815fcd762 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 14 Oct 2008 17:50:43 -0700 Subject: xen: compilation fix fo xen CPU hotplugging This patch fixes compilation error on ia64. include asm/xen/hypervisor.h instead of asm-x86/xen/hypervisor.h use xen_pv_domain() instead of is_running_on_xen() > CC drivers/xen/cpu_hotplug.o > In file included from /linux-2.6/drivers/xen/cpu_hotplug.c:5: > /linux-2.6/include/asm-x86/xen/hypervisor.h:44:22: error: asm/desc.h: No such file or directory > /linux-2.6/include/asm-x86/xen/hypervisor.h:66:1: warning: "MULTI_UVMFLAGS_INDEX" redefined > In file included from /linux-2.6/include/asm-x86/xen/hypervisor.h:52, > from /linux-2.6/drivers/xen/cpu_hotplug.c:5: > /linux-2.6/arch/ia64/include/asm/xen/hypercall.h:233:1: warning: this is the location of the previous definition > /linux-2.6/drivers/xen/cpu_hotplug.c: In function 'setup_vcpu_hotplug_event': > /linux-2.6/drivers/xen/cpu_hotplug.c:81: error: implicit declaration of function 'is_running_on_xen' > make[4]: *** [drivers/xen/cpu_hotplug.o] Error 1 > make[4]: *** Waiting for unfinished jobs.... Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Cc: Isaku Yamahata Signed-off-by: Ingo Molnar --- drivers/xen/cpu_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 565280ec1c6a..974f56d1ebe1 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -2,7 +2,7 @@ #include -#include +#include #include static void enable_hotplug_cpu(int cpu) -- cgit v1.2.3 From 8dd2337470d2ead6835982ed856d988f9e062140 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 14 Oct 2008 17:50:44 -0700 Subject: xen: compilation fix of drivers/xen/events.c on IA64 use set_xen_guest_handle() instead of direct assigning. > linux-2.6/drivers/xen/events.c: In function 'xen_poll_irq': > linux-2.6/drivers/xen/events.c:757: error: incompatible types in assignment > make[4]: *** [drivers/xen/events.o] Error 1 Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Cc: Isaku Yamahata Signed-off-by: Ingo Molnar --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 9ce1ab6c268d..1e3b934a4cf7 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -774,7 +774,7 @@ void xen_poll_irq(int irq) poll.nr_ports = 1; poll.timeout = 0; - poll.ports = &evtchn; + set_xen_guest_handle(poll.ports, &evtchn); if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) BUG(); -- cgit v1.2.3 From d98d38f2014ab79f28c126ff175d034891f7aefc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 29 Oct 2008 14:24:09 -0700 Subject: mutex: improve header comment to be actually informative about the API Impact: improve documentation It's nice to say that mutex_trylock follows the spin_trylock convention. It's a lot nicer if the comment also says which that is... make it so. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/mutex.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index bc6da10ceee0..7a0e5c4f8072 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! + * + * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); -- cgit v1.2.3 From e25cf3db560e803292946ef23a30c69e341ce56f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 15:55:07 +0200 Subject: lockdep: include/linux/lockdep.h - fix warning in net/bluetooth/af_bluetooth.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix this warning: net/bluetooth/af_bluetooth.c:60: warning: ‘bt_key_strings’ defined but not used net/bluetooth/af_bluetooth.c:71: warning: ‘bt_slock_key_strings’ defined but not used this is a lockdep macro problem in the !LOCKDEP case. We cannot convert it to an inline because the macro works on multiple types, but we can mark the parameter used. [ also clean up a misaligned tab in sock_lock_init_class_and_name() ] [ also remove #ifdefs from around af_family_clock_key strings - which were certainly added to get rid of the ugly build warnings. ] Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 5 +++-- include/net/sock.h | 2 +- net/core/sock.c | 2 -- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fc9f8e88123b..8956daf64abd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -336,10 +336,11 @@ static inline void lockdep_on(void) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) -# define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) +# define lockdep_init_map(lock, name, key, sub) \ + do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ - do { (void)(key); } while (0) + do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) diff --git a/include/net/sock.h b/include/net/sock.h index c04f9e18ea22..2f47107f6d0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -815,7 +815,7 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ - sk->sk_lock.owned = 0; \ + sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ diff --git a/net/core/sock.c b/net/core/sock.c index 5e2a3132a8c9..341e39456952 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -136,7 +136,6 @@ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -187,7 +186,6 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_MAX" }; -#endif /* * sk_callback_lock locking rules are per-address-family, -- cgit v1.2.3 From 74fcd524e808975dd546dac847119f1995a7c622 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:39:52 +0100 Subject: account_steal_time: kill the unneeded account_group_system_time() Impact: remove unnecessary accounting call I don't actually understand account_steal_time() and I failed to find the commit which added account_group_system_time(), but this looks bogus. In any case rq->idle must be single-threaded, so it can't have ->totals. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index c94baf2969e7..b388c9b243e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4202,7 +4202,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); - account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else -- cgit v1.2.3 From ce394471d13bf071939a9a0b48c64c297676d233 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:40:01 +0100 Subject: thread_group_cputime: kill the bogus ->signal != NULL check Impact: simplify the code thread_group_cputime() is called by current when it must have the valid ->signal, or under ->siglock, or under tasklist_lock after the ->signal check, or the caller is wait_task_zombie() which reaps the child. In any case ->signal can't be NULL. But the point of this patch is not optimization. If it is possible to call thread_group_cputime() when ->signal == NULL we are doing something wrong, and we should not mask the problem. thread_group_cputime() fills *times and the caller will use it, if we silently use task_struct->*times* we report the wrong values. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 895337b16a24..3f4377e0aa04 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -58,21 +58,21 @@ void thread_group_cputime( struct task_struct *tsk, struct task_cputime *times) { - struct signal_struct *sig; + struct task_cputime *totals, *tot; int i; - struct task_cputime *tot; - sig = tsk->signal; - if (unlikely(!sig) || !sig->cputime.totals) { + totals = tsk->signal->cputime.totals; + if (!totals) { times->utime = tsk->utime; times->stime = tsk->stime; times->sum_exec_runtime = tsk->se.sum_exec_runtime; return; } + times->stime = times->utime = cputime_zero; times->sum_exec_runtime = 0; for_each_possible_cpu(i) { - tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + tot = per_cpu_ptr(totals, i); times->utime = cputime_add(times->utime, tot->utime); times->stime = cputime_add(times->stime, tot->stime); times->sum_exec_runtime += tot->sum_exec_runtime; -- cgit v1.2.3 From 2b5fe6de58276d0b5a7c884d5dbfc300ca47db78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:40:08 +0100 Subject: thread_group_cputime: move a couple of callsites outside of ->siglock Impact: relax the locking of cpu-time accounting calls ->siglock buys nothing for thread_group_cputime() in do_sys_times() and wait_task_zombie() (which btw takes the unrelated parent's ->siglock). Actually I think do_sys_times() doesn't need ->siglock at all. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- kernel/sys.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fae..b9c4d8bb72e5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1330,10 +1330,10 @@ static int wait_task_zombie(struct task_struct *p, int options, * group, which consolidates times for all threads in the * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; - thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, cputime_add(cputime.utime, diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d16..5fc3a0cfb994 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms) struct task_cputime cputime; cputime_t cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); -- cgit v1.2.3