From 86397dc3ccfc0e17b7550d05eaf15fe91f6498dd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 17 Aug 2010 13:53:06 +0800 Subject: tracing: Clean up seqfile code for format file Remove the nasty hack that marks a pointer's LSB to distinguish common fields from event fields. Replace it with a more sane approach. Signed-off-by: Li Zefan LKML-Reference: <4C6A23C2.9020606@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 55 +++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,21 +600,29 @@ out: enum { FORMAT_HEADER = 1, - FORMAT_PRINTFMT = 2, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - head = &ftrace_common_fields; + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + case FORMAT_FIELD_SEPERATOR: if (unlikely(list_empty(head))) return NULL; @@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) return NULL; } - head = trace_get_fields(call); - - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it in case. - */ - v = (void *)((unsigned long)v & ~1L); - field = v; - /* - * If this is a common field, and at the end of the list, then - * continue with main list. - */ - if (field->link.prev == &ftrace_common_fields) { - if (unlikely(list_empty(head))) - return NULL; - field = list_entry(head->prev, struct ftrace_event_field, link); - /* Set the LSB to notify f_show to print an extra newline */ - field = (struct ftrace_event_field *) - ((unsigned long)field | 1); - return field; - } - - /* If we are done tell f_show to print the format */ - if (field->link.prev == head) + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) return (void *)FORMAT_PRINTFMT; field = list_entry(field->link.prev, struct ftrace_event_field, link); @@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "format:\n"); return 0; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it and - * print a newline if it is set. - */ - if ((unsigned long)v & 1) { - seq_putc(m, '\n'); - v = (void *)((unsigned long)v & ~1L); - } - field = v; /* -- cgit v1.2.3 From 56962b4449af34070bb1994621ef4f0265eed4d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 30 Jun 2010 23:03:51 +0200 Subject: perf: Generalize some arch callchain code - Most archs use one callchain buffer per cpu, except x86 that needs to deal with NMIs. Provide a default perf_callchain_buffer() implementation that x86 overrides. 
- Centralize all the kernel/user regs handling and invoke new arch handlers from there: perf_callchain_user() / perf_callchain_kernel() That avoid all the user_mode(), current->mm checks and so... - Invert some parameters in perf_callchain_*() helpers: entry to the left, regs to the right, following the traditional (dst, src). Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/arm/kernel/perf_event.c | 43 +++---------------------------- arch/powerpc/kernel/perf_callchain.c | 49 +++++++++++------------------------- arch/sh/kernel/perf_callchain.c | 37 ++------------------------- arch/sparc/kernel/perf_event.c | 46 +++++++++++---------------------- arch/x86/kernel/cpu/perf_event.c | 45 ++++++--------------------------- include/linux/perf_event.h | 10 +++++++- kernel/perf_event.c | 40 +++++++++++++++++++++++++++-- 7 files changed, 90 insertions(+), 180 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index a07c3b1955f0..0e3bbdb15927 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -3044,17 +3044,13 @@ user_backtrace(struct frame_tail *tail, return buftail.fp - 1; } -static void -perf_callchain_user(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct frame_tail *tail; perf_callchain_store(entry, PERF_CONTEXT_USER); - if (!user_mode(regs)) - regs = task_pt_regs(current); - tail = (struct frame_tail *)regs->ARM_fp - 1; while (tail && !((unsigned long)tail & 0x3)) @@ -3075,9 +3071,8 @@ callchain_trace(struct stackframe *fr, return 0; } -static void -perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stackframe fr; @@ -3088,33 +3083,3 @@ perf_callchain_kernel(struct pt_regs *regs, fr.pc = regs->ARM_pc; walk_stackframe(&fr, callchain_trace, entry); } - -static void -perf_do_callchain(struct pt_regs *regs, - struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); - -struct perf_callchain_entry * -perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - perf_do_callchain(regs, entry); - return entry; -} diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index a286c2e5a3ea..f7a85ede8407 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -46,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) return 0; } -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -221,8 +221,8 @@ static int sane_signal_64_frame(unsigned long sp) puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, 
+ struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -303,8 +303,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) return __get_user_inatomic(*ret, ptr); } -static inline void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -423,8 +423,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp, return mctx->mc_gregs; } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned int sp, next_sp; unsigned int next_ip; @@ -471,32 +471,11 @@ static void perf_callchain_user_32(struct pt_regs *regs, } } -/* - * Since we can't get PMU interrupts inside a PMU interrupt handler, - * we don't need separate irq and nmi entries here. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain); - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (current_is_64bit()) - perf_callchain_user_64(regs, entry); - else - perf_callchain_user_32(regs, entry); - } - - return entry; + if (current_is_64bit()) + perf_callchain_user_64(entry, regs); + else + perf_callchain_user_32(entry, regs); } diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c index 00143f3dd196..ef076a91292a 100644 --- a/arch/sh/kernel/perf_callchain.c +++ b/arch/sh/kernel/perf_callchain.c @@ -44,44 +44,11 @@ static const struct stacktrace_ops callchain_ops = { .address = callchain_address, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->pc); unwind_stack(NULL, regs, NULL, &callchain_ops, entry); } - -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - /* - * Only the kernel side is implemented for now. - */ - if (!is_user) - perf_callchain_kernel(regs, entry); -} - -/* - * No need for separate IRQ and NMI entries. 
- */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 2a95a9079862..460162d74aba 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1283,14 +1283,16 @@ void __init init_hw_perf_events(void) register_die_notifier(&perf_event_nmi_notifier); } -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ksp, fp; #ifdef CONFIG_FUNCTION_GRAPH_TRACER int graph = 0; #endif + stack_trace_flush(); + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->tpc); @@ -1330,8 +1332,8 @@ static void perf_callchain_kernel(struct pt_regs *regs, } while (entry->nr < PERF_MAX_STACK_DEPTH); } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ufp; @@ -1353,8 +1355,8 @@ static void perf_callchain_user_64(struct pt_regs *regs, } while (entry->nr < PERF_MAX_STACK_DEPTH); } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long ufp; @@ -1376,30 +1378,12 @@ static void perf_callchain_user_32(struct pt_regs *regs, } while (entry->nr < PERF_MAX_STACK_DEPTH); } -/* Like powerpc we can't get PMU interrupts within the PMU handler, - * so no need for separate NMI and IRQ chains as on x86. 
- */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); - - entry->nr = 0; - if (!user_mode(regs)) { - stack_trace_flush(); - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - if (regs) { - flushw_user(); - if (test_thread_flag(TIF_32BIT)) - perf_callchain_user_32(regs, entry); - else - perf_callchain_user_64(regs, entry); - } - return entry; + flushw_user(); + if (test_thread_flag(TIF_32BIT)) + perf_callchain_user_32(entry, regs); + else + perf_callchain_user_64(entry, regs); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8af28caeafc1..39f8421b86e6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1571,9 +1571,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) * callchain support */ - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry_nmi); static void @@ -1607,8 +1605,8 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->ip); @@ -1653,14 +1651,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); fp = (void __user *)regs->bp; @@ -1687,42 +1683,17 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry *perf_callchain_buffer(void) { - struct perf_callchain_entry *entry; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return NULL; } if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); + return &__get_cpu_var(perf_callchain_entry_nmi); - return entry; + return &__get_cpu_var(perf_callchain_entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 358880404b42..4db61dded388 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -976,7 +976,15 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); -extern struct 
perf_callchain_entry *perf_callchain(struct pt_regs *regs); +/* Callchains */ +DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); + +extern void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs); +extern struct perf_callchain_entry *perf_callchain_buffer(void); + static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c772a3d4000d..02efde6c8798 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2937,13 +2937,49 @@ void perf_event_do_pending(void) __perf_pending_run(); } +DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); + /* * Callchain support -- arch specific */ -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain_buffer(void) { - return NULL; + return &__get_cpu_var(perf_callchain_entry); +} + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + entry = perf_callchain_buffer(); + if (!entry) + return NULL; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) + perf_callchain_user(entry, regs); + + return entry; } -- cgit v1.2.3 From f72c1a931e311bb7780fee19e41a89ac42cab50e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 02:31:21 +0200 Subject: perf: Factorize callchain context handling Store the kernel and user contexts from the generic layer instead of archs, this gathers some repetitive code. 
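To make the end state easier to see than in the flattened diffs, this is how the generic helper in kernel/perf_event.c reads once the previous patch and this one are applied (a condensed sketch for orientation, not the verbatim file contents):

    static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
    {
            struct perf_callchain_entry *entry;

            entry = perf_callchain_buffer();
            if (!entry)
                    return NULL;

            entry->nr = 0;

            if (!user_mode(regs)) {
                    /* the context marker is now stored here, not in each arch */
                    perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
                    perf_callchain_kernel(entry, regs);
                    if (current->mm)
                            regs = task_pt_regs(current);
                    else
                            regs = NULL;
            }

            if (regs) {
                    perf_callchain_store(entry, PERF_CONTEXT_USER);
                    perf_callchain_user(entry, regs);
            }

            return entry;
    }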
Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/arm/kernel/perf_event.c | 2 -- arch/powerpc/kernel/perf_callchain.c | 3 --- arch/sh/kernel/perf_callchain.c | 1 - arch/sparc/kernel/perf_event.c | 3 --- arch/x86/kernel/cpu/perf_event.c | 2 -- kernel/perf_event.c | 5 ++++- 6 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 0e3bbdb15927..64ca8c3ab94b 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -3049,7 +3049,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct frame_tail *tail; - perf_callchain_store(entry, PERF_CONTEXT_USER); tail = (struct frame_tail *)regs->ARM_fp - 1; @@ -3076,7 +3075,6 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stackframe fr; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); fr.fp = regs->ARM_fp; fr.sp = regs->ARM_sp; fr.lr = regs->ARM_lr; diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index f7a85ede8407..d05ae4204bbf 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -57,7 +57,6 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) lr = regs->link; sp = regs->gpr[1]; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->nip); if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) @@ -234,7 +233,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); for (;;) { @@ -435,7 +433,6 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); while (entry->nr < PERF_MAX_STACK_DEPTH) { diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c index ef076a91292a..d5ca1ef50fa9 100644 --- a/arch/sh/kernel/perf_callchain.c +++ b/arch/sh/kernel/perf_callchain.c @@ -47,7 +47,6 @@ static const struct stacktrace_ops callchain_ops = { void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->pc); unwind_stack(NULL, regs, NULL, &callchain_ops, entry); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 460162d74aba..4bc402938575 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1293,7 +1293,6 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, stack_trace_flush(); - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->tpc); ksp = regs->u_regs[UREG_I6]; @@ -1337,7 +1336,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, { unsigned long ufp; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, regs->tpc); ufp = regs->u_regs[UREG_I6] + STACK_BIAS; @@ -1360,7 +1358,6 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, { unsigned long ufp; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, 
regs->tpc); ufp = regs->u_regs[UREG_I6] & 0xffffffffUL; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 39f8421b86e6..a3c922288cc0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1608,7 +1608,6 @@ static const struct stacktrace_ops backtrace_ops = { void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); @@ -1660,7 +1659,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) fp = (void __user *)regs->bp; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 02efde6c8798..615d024894cf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2969,6 +2969,7 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry->nr = 0; if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); if (current->mm) regs = task_pt_regs(current); @@ -2976,8 +2977,10 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) regs = NULL; } - if (regs) + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); + } return entry; } -- cgit v1.2.3 From 927c7a9e92c4f69097a6e9e086d11fc2f8a5b40b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 16:20:36 +0200 Subject: perf: Fix race in callchains Now that software events don't have interrupt disabled anymore in the event path, callchains can nest on any context. So seperating nmi and others contexts in two buffers has become racy. Fix this by providing one buffer per nesting level. Given the size of the callchain entries (2040 bytes * 4), we now need to allocate them dynamically. v2: Fixed put_callchain_entry call after recursion. Fix the type of the recursion, it must be an array. v3: Use a manual pr cpu allocation (temporary solution until NMIs can safely access vmalloc'ed memory). Do a better separation between callchain reference tracking and allocation. Make the "put" path lockless for non-release cases. v4: Protect the callchain buffers with rcu. v5: Do the cpu buffers allocations node affine. 
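Because the diff below is hard to follow in this flattened form, the core of the new buffer lookup is reproduced here with comments added; it is condensed from the patch itself, with the allocation and error paths omitted:

    static struct perf_callchain_entry *get_callchain_entry(int *rctx)
    {
            int cpu;
            struct callchain_cpus_entries *entries;

            /* one recursion counter per context: task, softirq, hardirq, NMI */
            *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
            if (*rctx == -1)
                    return NULL;

            /* the buffers are RCU-protected so they can be freed safely on release */
            entries = rcu_dereference(callchain_cpus_entries);
            if (!entries)
                    return NULL;

            cpu = smp_processor_id();

            /* each CPU owns four entries, one per nesting level */
            return &entries->cpu_entries[cpu][*rctx];
    }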
Signed-off-by: Frederic Weisbecker Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: Paul Mundt Cc: David Miller Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 22 ++- include/linux/perf_event.h | 1 - kernel/perf_event.c | 298 ++++++++++++++++++++++++++++++--------- 3 files changed, 238 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a3c922288cc0..8e91cf34a9c8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1608,6 +1608,11 @@ static const struct stacktrace_ops backtrace_ops = { void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); @@ -1656,6 +1661,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) struct stack_frame frame; const void __user *fp; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } fp = (void __user *)regs->bp; @@ -1681,19 +1690,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -struct perf_callchain_entry *perf_callchain_buffer(void) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return NULL; - } - - if (in_nmi()) - return &__get_cpu_var(perf_callchain_entry_nmi); - - return &__get_cpu_var(perf_callchain_entry); -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4db61dded388..d7e8ea690864 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -983,7 +983,6 @@ extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); -extern struct perf_callchain_entry *perf_callchain_buffer(void); static inline void diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 615d024894cf..75ab8a2df6b2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1763,6 +1763,216 @@ static u64 perf_event_read(struct perf_event *event) return perf_event_count(event); } +/* + * Callchain support + */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[4]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + 
rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * + num_possible_cpus(); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * 4; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + /* * Initialize the perf_event context in a task_struct: */ @@ -1895,6 +2105,8 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); } if (event->buffer) { @@ -2937,55 +3149,6 @@ void 
perf_event_do_pending(void) __perf_pending_run(); } -DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain_buffer(void) -{ - return &__get_cpu_var(perf_callchain_entry); -} - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - entry = perf_callchain_buffer(); - if (!entry) - return NULL; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - - return entry; -} - - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -3480,14 +3643,20 @@ static void perf_event_output(struct perf_event *event, int nmi, struct perf_output_handle handle; struct perf_event_header header; + /* protect the callchain buffers */ + rcu_read_lock(); + perf_prepare_sample(&header, data, event, regs); if (perf_output_begin(&handle, event, header.size, nmi, 1)) - return; + goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); + +exit: + rcu_read_unlock(); } /* @@ -4243,32 +4412,16 @@ end: int perf_swevent_get_recursion_context(void) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - int rctx; - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (cpuctx->recursion[rctx]) - return -1; - - cpuctx->recursion[rctx]++; - barrier(); - - return rctx; + return get_recursion_context(cpuctx->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - barrier(); - cpuctx->recursion[rctx]--; + + put_recursion_context(cpuctx->recursion, rctx); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4968,6 +5121,13 @@ done: atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } } return event; -- cgit v1.2.3 From 7ae07ea3a48d30689ee037cb136bc21f0b37d8ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 14 Aug 2010 20:45:13 +0200 Subject: perf: Humanize the number of contexts Instead of hardcoding the number of contexts for the recursions barriers, define a cpp constant to make the code more self-explanatory. 
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian --- include/linux/perf_event.h | 14 ++++++++------ kernel/perf_event.c | 4 ++-- kernel/trace/trace_event_perf.c | 8 ++++---- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d7e8ea690864..ae6fa6050925 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -808,6 +808,12 @@ struct perf_event_context { struct rcu_head rcu_head; }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + /** * struct perf_event_cpu_context - per cpu event context structure */ @@ -821,12 +827,8 @@ struct perf_cpu_context { struct mutex hlist_mutex; int hlist_refcount; - /* - * Recursion avoidance: - * - * task, softirq, irq, nmi context - */ - int recursion[4]; + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; }; struct perf_output_handle { diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 75ab8a2df6b2..f416aef242c3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1772,7 +1772,7 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; -static DEFINE_PER_CPU(int, callchain_recursion[4]); +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); struct callchain_cpus_entries *callchain_cpus_entries; @@ -1828,7 +1828,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * 4; + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 000e6e85b445..db2eae2efcf2 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include #include "trace.h" -static char *perf_trace_buf[4]; +static char *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -45,7 +45,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, char *buf; int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -65,7 +65,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -140,7 +140,7 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } -- cgit v1.2.3 From 6016ee13db518ab1cd0cbf43fc2ad5712021e338 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Aug 2010 12:47:59 +0900 Subject: perf, tracing: add missing __percpu markups ftrace_event_call->perf_events, perf_trace_buf, fgraph_data->cpu_data and some local variables are percpu pointers missing __percpu markups. Add them. 
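The annotation is only consumed by sparse: a __percpu pointer lives in its own address space and has to be converted through the per-cpu accessors before it may be dereferenced. A minimal usage sketch follows; the helper name is hypothetical and it assumes, like the perf code it mirrors, that the caller already runs with preemption disabled:

    #include <linux/errno.h>
    #include <linux/percpu.h>
    #include <linux/rculist.h>

    /* Hypothetical helper illustrating the __percpu conversion pattern. */
    static int example_attach(struct hlist_head __percpu *heads,
                              struct hlist_node *node)
    {
            struct hlist_head *head;

            if (!heads)
                    return -EINVAL;

            /*
             * this_cpu_ptr() turns the __percpu pointer into a plain pointer
             * to this CPU's instance; mixing the two address spaces directly
             * is what sparse warns about.
             */
            head = this_cpu_ptr(heads);
            hlist_add_head_rcu(node, head);

            return 0;
    }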
Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian LKML-Reference: <1281498479-28551-1-git-send-email-namhyung@gmail.com> Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 4 ++-- kernel/trace/trace_event_perf.c | 15 ++++++++------- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 02b8b24f8f51..5f8ad7bec636 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -191,8 +191,8 @@ struct ftrace_event_call { unsigned int flags; #ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head *perf_events; + int perf_refcount; + struct hlist_head __percpu *perf_events; #endif }; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index db2eae2efcf2..92f5477a006a 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include #include "trace.h" -static char *perf_trace_buf[PERF_NR_CONTEXTS]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -24,7 +24,7 @@ static int total_ref_count; static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; + struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; @@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -102,13 +102,14 @@ int perf_trace_init(struct perf_event *p_event) int perf_trace_enable(struct perf_event *p_event) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6bff23625781..fcb5a542cd21 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -23,7 +23,7 @@ struct fgraph_cpu_data { }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; -- cgit v1.2.3 From 277b199800ac90811ac86d215063df1984f51619 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 20 Aug 2010 11:03:51 +0800 Subject: lockup_detector: Make callback function static watchdog_overflow_callback() is only used in kernel/watchdog.c. 
Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Don Zickus LKML-Reference: <1282273431.16443.32.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 613bc1f04610..b60e2a869bba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -202,7 +202,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { -- cgit v1.2.3 From 861d034ee814917a83bd5de4b26e3b8336ddeeb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Aug 2010 13:31:43 +0200 Subject: sched: Fix rq->clock synchronization when migrating tasks sched_fork() -- we do task placement in ->task_fork_fair() ensure we update_rq_clock() so we work with current time. We leave the vruntime in relative state, so the time delay until wake_up_new_task() doesn't matter. wake_up_new_task() -- Since task_fork_fair() left p->vruntime in relative state we can safely migrate, the activate_task() on the remote rq will call update_rq_clock() and causes the clock to be synced (enough). Tested-by: Jack Daniel Tested-by: Philby John Signed-off-by: Peter Zijlstra LKML-Reference: <1281002322.1923.1708.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 806d1b227a21..ab661ebc4895 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p) raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); -- cgit v1.2.3 From c6db67cda735d8ace5f19c3831240e1408679790 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Aug 2010 11:49:15 +0200 Subject: watchdog: Don't throttle the watchdog Stephane reported that when the machine locks up, the regular ticks, which are responsible to resetting the throttle count, stop too. Hence the NMI watchdog can end up being throttled before it reports on the locked up state, and we end up being sad.. Cure this by having the watchdog overflow reset its own throttle count. 
Reported-by: Stephane Eranian Tested-by: Stephane Eranian Cc: Don Zickus Cc: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1282215916.1926.4696.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 613bc1f04610..0d53c8e853b1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -206,6 +206,9 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + if (__get_cpu_var(watchdog_nmi_touch) == true) { __get_cpu_var(watchdog_nmi_touch) = false; return; -- cgit v1.2.3 From 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 18 Aug 2010 15:00:27 -0700 Subject: mutex: Improve the scalability of optimistic spinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a scalability issue for current implementation of optimistic mutex spin in the kernel. It is found on a 8 node 64 core Nehalem-EX system (HT mode). The intention of the optimistic mutex spin is to busy wait and spin on a mutex if the owner of the mutex is running, in the hope that the mutex will be released soon and be acquired, without the thread trying to acquire mutex going to sleep. However, when we have a large number of threads, contending for the mutex, we could have the mutex grabbed by other thread, and then another ……, and we will keep spinning, wasting cpu cycles and adding to the contention. One possible fix is to quit spinning and put the current thread on wait-list if mutex lock switch to a new owner while we spin, indicating heavy contention (see the patch included). I did some testing on a 8 socket Nehalem-EX system with a total of 64 cores. Using Ingo's test-mutex program that creates/delete files with 256 threads (http://lkml.org/lkml/2006/1/8/50) , I see the following speed up after putting in the mutex spin fix: ./mutex-test V 256 10 Ops/sec 2.6.34 62864 With fix 197200 Repeating the test with Aim7 fserver workload, again there is a speed up with the fix: Jobs/min 2.6.34 91657 With fix 149325 To look at the impact on the distribution of mutex acquisition time, I collected the mutex acquisition time on Aim7 fserver workload with some instrumentation. The average acquisition time is reduced by 48% and number of contentions reduced by 32%. #contentions Time to acquire mutex (cycles) 2.6.34 72973 44765791 With fix 49210 23067129 The histogram of mutex acquisition time is listed below. The acquisition time is in 2^bin cycles. We see that without the fix, the acquisition time is mostly around 2^26 cycles. With the fix, we the distribution get spread out a lot more towards the lower cycles, starting from 2^13. However, there is an increase of the tail distribution with the fix at 2^28 and 2^29 cycles. It seems a small price to pay for the reduced average acquisition time and also getting the cpu to do useful work. 
Mutex acquisition time distribution (acq time = 2^bin cycles):

                         2.6.34                  With Fix
     bin     #occurrence      %       #occurrence      %
      11               2    0.00%             120    0.24%
      12              10    0.01%             790    1.61%
      13              14    0.02%            2058    4.18%
      14              86    0.12%            3378    6.86%
      15             393    0.54%            4831    9.82%
      16             710    0.97%            4893    9.94%
      17             815    1.12%            4667    9.48%
      18             790    1.08%            5147   10.46%
      19             580    0.80%            6250   12.70%
      20             429    0.59%            6870   13.96%
      21             311    0.43%            1809    3.68%
      22             255    0.35%            2305    4.68%
      23             317    0.44%             916    1.86%
      24             610    0.84%             233    0.47%
      25            3128    4.29%              95    0.19%
      26           63902   87.69%             122    0.25%
      27             619    0.85%             286    0.58%
      28               0    0.00%            3536    7.19%
      29               0    0.00%             903    1.83%
      30               0    0.00%               0    0.00%

I've done similar experiments with the 2.6.35 kernel on smaller boxes as well. One is on a dual-socket Westmere box (12 cores total, with HT). Another experiment is on an old dual-socket Core 2 box (4 cores total, no HT).

On the 12-core Westmere box, I see a 250% increase for Ingo's mutex-test program with my mutex patch but no significant difference in aim7's fserver workload. On the 4-core Core 2 box, the differences with the patch for both mutex-test and aim7 fserver are negligible. So far, it seems like the patch has not caused regression on smaller systems.

Signed-off-by: Tim Chen
Acked-by: Peter Zijlstra
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Thomas Gleixner
Cc: Frederic Weisbecker
Cc: # .35.x
LKML-Reference: <1282168827.9542.72.camel@schen9-DESK>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 41541d79e3c8..09b574e7f4df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3865,8 +3865,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 		/*
 		 * Owner changed, break to re-assess state.
 		 */
-		if (lock->owner != owner)
+		if (lock->owner != owner) {
+			/*
+			 * If the lock has switched to a different owner,
+			 * we likely have heavy contention. Return 0 to quit
+			 * optimistic spinning and not contend further:
+			 */
+			if (lock->owner)
+				return 0;
 			break;
+		}

 		/*
 		 * Is that owner really running on that cpu?
-- cgit v1.2.3
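For readers without kernel/mutex.c at hand, the heuristic in the mutex patch above can be summarized as a toy spin routine. Every name below is hypothetical, and the memory-ordering and need_resched() details of the real code are deliberately left out:

    #include <linux/sched.h>        /* struct task_struct, task_curr(), cpu_relax() */

    struct toy_mutex {
            struct task_struct *owner;      /* NULL when not owned */
    };

    /* Stand-in check: is the owner currently running on a CPU? */
    static bool toy_owner_is_running(struct task_struct *owner)
    {
            return task_curr(owner);
    }

    /*
     * Returns 1 if the caller should retry the lock fast path,
     * 0 if it should give up spinning and queue up to sleep.
     */
    static int toy_spin_on_owner(struct toy_mutex *lock, struct task_struct *owner)
    {
            for (;;) {
                    struct task_struct *cur = lock->owner;

                    if (cur != owner) {
                            /*
                             * The lock changed hands while we were spinning.
                             * If somebody else already owns it, contention is
                             * heavy, so stop spinning instead of piling on;
                             * if it was released, retry the fast path.
                             */
                            return cur == NULL;
                    }

                    if (!toy_owner_is_running(owner))
                            return 0;       /* owner is off-CPU; spinning is pointless */

                    cpu_relax();
            }
    }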