From fe4a330885aee20f233de36085fb15c38094e635 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:44:06 +0100 Subject: perf, x86: Implement user-space RDPMC support, to allow fast, user-space access to self-monitoring counters Implement a correct pmu::event_idx for the x86 counter index rules and set CR4.PCE on CPU_STARTING. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Cc: Thomas Gleixner Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/n/tip-mwxab34dibqgzk5zywutfnha@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..53b569910175 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1210,6 +1210,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1542,6 +1543,18 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { + idx -= X86_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -1557,6 +1570,8 @@ static struct pmu pmu = { .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, }; /* -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 8 ++++++ include/linux/perf_event.h | 1 + kernel/events/core.c | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 53b569910175..116b040a73a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - set_in_cr4(X86_CR4_PCE); + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void) } } + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event) return idx + 1; } +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + NULL, +}; + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, + .attr_groups = x86_pmu_attr_groups, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..513d617b93c4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -307,6 +307,14 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; + /* + * sysfs attrs + */ + int attr_rdpmc; + + /* + * CPU Hotplug hooks + */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 05affc3878ff..dcd4049e92fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++ include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 ++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 116b040a73a8..f8bddb5b0600 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "perf_event.h" @@ -1627,6 +1628,19 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return; + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + return; + + userpg->time_mult = this_cpu_read(cyc2ns); + userpg->time_shift = CYC2NS_SCALE_FACTOR; + userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} + /* * callchain support */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dcd4049e92fc..3a9c7d81afbf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3 From 84f2b9b2edc09595569c7397cc3c888764ffd78b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 2 Feb 2012 12:04:01 +0100 Subject: perf: Remove deprecated WARN_ON_ONCE() With the new throttling/unthrottling code introduced with commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") we occasionally hit two WARN_ON_ONCE() checks in: - intel_pmu_pebs_enable() - intel_pmu_lbr_enable() - x86_pmu_start() The assertions are no longer problematic. There is a valid path where they can trigger but it is harmless. The assertion can be triggered with: $ perf record -e instructions:pp .... Leading to paths: intel_pmu_pebs_enable intel_pmu_enable_event x86_perf_event_set_period x86_pmu_start perf_adjust_freq_unthr_context perf_event_task_tick scheduler_tick And: intel_pmu_lbr_enable intel_pmu_enable_event x86_perf_event_set_period x86_pmu_start perf_adjust_freq_unthr_context. perf_event_task_tick scheduler_tick cpuc->enabled is always on because when we get to perf_adjust_freq_unthr_context() the PMU is not totally disabled. Furthermore when we need to adjust a period, we only stop the event we need to change and not the entire PMU. Thus, when we re-enable, cpuc->enabled is already set. Note that when we stop the event, both pebs and lbr are stopped if necessary (and possible). Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20120202110401.GA30911@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 1 - arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..2a30e5ae6acf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,9 +986,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - if (WARN_ON_ONCE(idx == -1)) return; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 73da6b64f5b7..d6bd49faa40c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - WARN_ON_ONCE(cpuc->enabled); if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) intel_pmu_lbr_enable(event); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce96..47a7e63bfe54 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - WARN_ON_ONCE(cpuc->enabled); - /* * Reset the LBR stack if we changed task context to * avoid data leaks. -- cgit v1.2.3 From d1a797f388d6d30fa502915d1b9937ed758b7137 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 10:06:34 -0800 Subject: x32: Handle process creation Allow an x32 process to be started. Originally-by: H. J. Lu Signed-off-by: H. Peter Anvin Cc: Peter Zijlstra --- arch/x86/include/asm/compat.h | 26 ++++++++++++++++++++++++-- arch/x86/include/asm/elf.h | 25 +++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event.c | 4 +++- arch/x86/kernel/entry_64.S | 15 +++++++++++++++ arch/x86/kernel/process_64.c | 23 ++++++++++++++++------- 5 files changed, 79 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7938b84e4506..e7f68b49c01a 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include @@ -187,7 +188,20 @@ struct compat_shmid64_ds { /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ +#ifdef CONFIG_X86_X32_ABI +typedef struct user_regs_struct compat_elf_gregset_t; + +#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216) +#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296) +#define SET_PR_FPVALID(S,V) \ + do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \ + while (0) + +#define COMPAT_USE_64BIT_TIME \ + (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) +#else typedef struct user_regs_struct32 compat_elf_gregset_t; +#endif /* * A pointer passed in from user mode. This should not @@ -209,8 +223,16 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) static inline void __user *arch_compat_alloc_user_space(long len) { - struct pt_regs *regs = task_pt_regs(current); - return (void __user *)regs->sp - len; + compat_uptr_t sp; + + if (test_thread_flag(TIF_IA32)) { + sp = task_pt_regs(current)->sp; + } else { + /* -128 for the x32 ABI redzone */ + sp = percpu_read(old_rsp) - 128; + } + + return (void __user *)round_down(sp - len, 16); } static inline bool is_compat_task(void) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 410fa6a219f6..83aabea95dd7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -156,7 +156,12 @@ do { \ #define elf_check_arch(x) \ ((x)->e_machine == EM_X86_64) -#define compat_elf_check_arch(x) elf_check_arch_ia32(x) +#define compat_elf_check_arch(x) \ + (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) + +#if __USER32_DS != __USER_DS +# error "The following code assumes __USER32_DS == __USER_DS" +#endif static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) @@ -179,8 +184,9 @@ static inline void elf_common_init(struct thread_struct *t, void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); #define compat_start_thread start_thread_ia32 -void set_personality_ia32(void); -#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() +void set_personality_ia32(bool); +#define COMPAT_SET_PERSONALITY(ex) \ + set_personality_ia32((ex).e_machine == EM_X86_64) #define COMPAT_ELF_PLATFORM ("i686") @@ -296,9 +302,20 @@ do { \ (unsigned long)current->mm->context.vdso); \ } while (0) +#define ARCH_DLINFO_X32 \ +do { \ + if (vdso_enabled) \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso); \ +} while (0) + #define AT_SYSINFO 32 -#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32) +#define COMPAT_ARCH_DLINFO \ +if (test_thread_flag(TIF_X32)) \ + ARCH_DLINFO_X32; \ +else \ + ARCH_DLINFO_IA32(sysctl_vsyscall32) #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5adce1040b11..63c0e058a405 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -1595,6 +1594,9 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } #ifdef CONFIG_COMPAT + +#include + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 53dc821f0a62..9e036f0ce5e0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -763,6 +763,21 @@ ENTRY(stub_x32_rt_sigreturn) CFI_ENDPROC END(stub_x32_rt_sigreturn) +ENTRY(stub_x32_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys32_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execve) + #endif /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5fe2fbaa56ba..a0701da2bd18 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -364,7 +364,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, - __USER32_CS, __USER32_DS, __USER32_DS); + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); } #endif @@ -508,6 +510,7 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); @@ -522,22 +525,28 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(void) +void set_personality_ia32(bool x32) { /* inherit personality from parent */ /* Make sure to be in 32bit mode */ - set_thread_flag(TIF_IA32); set_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); - current->personality |= force_personality32; /* Mark the associated mm as containing 32-bit tasks. */ if (current->mm) current->mm->context.ia32_compat = 1; - /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + if (x32) { + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + current->personality &= ~READ_IMPLIES_EXEC; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; + } } unsigned long get_wchan(struct task_struct *p) -- cgit v1.2.3 From b36817e8863090f1f24e538106ca50fa1d9e4003 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:53 +0100 Subject: perf/x86: Add Intel LBR sharing logic The Intel LBR on some recent processor is capable of filtering branches by type. The filter is configurable via the LBR_SELECT MSR register. There are limitation on how this register can be used. On Nehalem/Westmere, the LBR_SELECT is shared by the two HT threads when HT is on. It is private to each core when HT is off. On SandyBridge, the LBR_SELECT register is private to each thread when HT is on. It is private to each core when HT is off. The kernel must manage the sharing of LBR_SELECT. It allows multiple users on the same logical CPU to use LBR_SELECT as long as they program it with the same value. Across sibling CPUs (HT threads), the same restriction applies on NHM/WSM. This patch implements this sharing logic by leveraging the mechanism put in place for managing the offcore_response shared MSR. We modify __intel_shared_reg_get_constraints() to cause x86_get_event_constraint() to be called because LBR may be associated with events that may be counter constrained. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++ arch/x86/kernel/cpu/perf_event.h | 4 ++ arch/x86/kernel/cpu/perf_event_intel.c | 70 +++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f8bddb5b0600..377931354ac7 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -426,6 +426,10 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; + /* mark not used */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 82db83b5c3bc..9b9c580a7ab8 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -33,6 +33,7 @@ enum extra_reg_type { EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -130,6 +131,7 @@ struct cpu_hw_events { void *lbr_context; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; /* * Intel host/guest exclude bits @@ -342,6 +344,8 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ /* * Extra registers for events diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3bd37bdf1b8e..97f7bb587519 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1123,17 +1123,17 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) + struct perf_event *event, + struct hw_perf_event_extra *reg) { struct event_constraint *c = &emptyconstraint; - struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) - return &unconstrained; + return NULL; /* call x86_get_event_constraint() */ again: era = &cpuc->shared_regs->regs[reg->idx]; @@ -1156,14 +1156,10 @@ again: reg->alloc = 1; /* - * All events using extra_reg are unconstrained. - * Avoids calling x86_get_event_constraints() - * - * Must revisit if extra_reg controlling events - * ever have constraints. Worst case we go through - * the regular event constraint table. + * need to call x86_get_event_constraint() + * to check if associated event has constraints */ - c = &unconstrained; + c = NULL; } else if (intel_try_alt_er(event, orig_idx)) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -1200,11 +1196,23 @@ static struct event_constraint * intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = NULL; - - if (event->hw.extra_reg.idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, event); - + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } return c; } @@ -1252,6 +1260,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, reg = &event->hw.extra_reg; if (reg->idx != EXTRA_REG_NONE) __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); } static void intel_put_event_constraints(struct cpu_hw_events *cpuc, @@ -1431,7 +1443,7 @@ static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!x86_pmu.extra_regs) + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) return NOTIFY_OK; cpuc->shared_regs = allocate_shared_regs(cpu); @@ -1453,22 +1465,28 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) return; - for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_shared_regs *pc; + if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; } static void intel_pmu_cpu_dying(int cpu) -- cgit v1.2.3 From ff3fb511ba377e8a0a7f553cc352237f70d08121 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:54 +0100 Subject: perf/x86: Sync branch stack sampling with precise_sampling If precise sampling is enabled on Intel x86 then perf_event uses PEBS. To correct for the off-by-one error of PEBS, perf_event uses LBR when precise_sample > 1. On Intel x86 PERF_SAMPLE_BRANCH_STACK is implemented using LBR, therefore both features must be coordinated as they may not configure LBR the same way. For PEBS, LBR needs to capture all branches at the priv level of the associated event. This patch checks that the branch type and priv level of BRANCH_STACK is compatible with that of the PEBS LBR requirement, thereby allowing: $ perf record -b any,u -e instructions:upp .... But: $ perf record -b any_call,u -e instructions:upp Is not possible. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 377931354ac7..cea567483274 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -353,6 +353,36 @@ int x86_setup_perfctr(struct perf_event *event) return 0; } +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { @@ -369,6 +399,36 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } } /* -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++--- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++ include/linux/perf_event.h | 9 +++- kernel/events/core.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cea567483274..0a18d16cb58d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f104c054dc5c..74387c12dc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -324,6 +324,7 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + void (*flush_branch_stack)(void); /* * Intel Arch Perfmon v2+ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7cc1e2dcc4dd..6627089232a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; static __init void intel_clovertown_quirk(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 242bb51c67f2..c61234b1a988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -137,6 +137,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_rotate_start(ctx->pmu); } +/* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event) atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } if (event->rb) { @@ -5924,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; -- cgit v1.2.3 From 641cc938815dfd09f8fa1ec72deb814f0938ac33 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 15 Mar 2012 20:09:14 +0100 Subject: perf: Adding sysfs group format attribute for pmu device Adding sysfs group 'format' attribute for pmu device that contains a syntax description on how to construct raw events. The event configuration is described in following struct pefr_event_attr attributes: config config1 config2 Each sysfs attribute within the format attribute group, describes mapping of name and bitfield definition within one of above attributes. eg: "/sys/.../format/event" contains "config:0-7" "/sys/.../format/umask" contains "config:8-15" "/sys/.../format/usr" contains "config:16" the attribute value syntax is: line: config ':' bits config: 'config' | 'config1' | 'config2" bits: bits ',' bit_term | bit_term bit_term: VALUE '-' VALUE | VALUE Adding format attribute definitions for x86 cpu pmus. Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra Signed-off-by: Jiri Olsa Link: http://lkml.kernel.org/n/tip-vhdk5y2hyype9j63prymty36@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- .../testing/sysfs-bus-event_source-devices-format | 14 +++++++++ arch/x86/kernel/cpu/perf_event.c | 7 +++++ arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_amd.c | 18 +++++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 36 ++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_p6.c | 19 ++++++++++++ include/linux/perf_event.h | 14 +++++++++ 7 files changed, 109 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-format (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format new file mode 100644 index 000000000000..079afc71363d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format @@ -0,0 +1,14 @@ +Where: /sys/bus/event_source/devices//format +Date: January 2012 +Kernel Version: 3.3 +Contact: Jiri Olsa +Description: + Attribute group to describe the magic bits that go into + perf_event_attr::config[012] for a particular pmu. + Each attribute of this group defines the 'hardware' bitmask + we want to export, so that userspace can deal with sane + name/value pairs. + + Example: 'config1:1,6-10,44' + Defines contents of attribute that occupies bits 1,6-10,44 of + perf_event_attr::config1. diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16cb58d..453ac9497574 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1314,6 +1314,11 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1388,6 +1393,7 @@ static int __init init_hw_perf_events(void) } x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + x86_pmu_format_group.attrs = x86_pmu.format_attrs; pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); @@ -1668,6 +1674,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, + &x86_pmu_format_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8484e77c211e..6638aaf54493 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -339,6 +339,7 @@ struct x86_pmu { * sysfs attrs */ int attr_rdpmc; + struct attribute **format_attrs; /* * CPU Hotplug hooks diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index dd002faff7a6..95e7fe1c5f0b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -404,6 +404,21 @@ static void amd_pmu_cpu_dead(int cpu) } } +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -426,6 +441,8 @@ static __initconst const struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, @@ -596,6 +613,7 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .cpu_dead = amd_pmu_cpu_dead, #endif .cpu_starting = amd_pmu_cpu_starting, + .format_attrs = amd_format_attr, }; __init int amd_pmu_init(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6a84e7f28f05..26b3e2fef104 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1431,6 +1431,24 @@ static void core_pmu_enable_all(int added) } } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1455,6 +1473,7 @@ static __initconst const struct x86_pmu core_pmu = { .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1553,6 +1572,21 @@ static void intel_pmu_flush_branch_stack(void) intel_pmu_lbr_reset(); } +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + NULL, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1576,6 +1610,8 @@ static __initconst const struct x86_pmu intel_pmu = { .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .format_attrs = intel_arch3_formats_attr, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index c7181befecde..32bcfc7dd230 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -87,6 +87,23 @@ static void p6_pmu_enable_event(struct perf_event *event) (void)checking_wrmsrl(hwc->config_base, val); } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = x86_pmu_handle_irq, @@ -115,6 +132,8 @@ static __initconst const struct x86_pmu p6_pmu = { .cntval_mask = (1ULL << 32) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, }; __init int p6_pmu_init(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bd9f55a5958d..57ae485e80fc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -550,6 +550,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include #define PERF_MAX_STACK_DEPTH 255 @@ -1291,5 +1292,18 @@ do { \ register_cpu_notifier(&fn##_nb); \ } while (0) + +#define PMU_FORMAT_ATTR(_name, _format) \ +static ssize_t \ +_name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ + \ +static struct device_attribute format_attr_##_name = __ATTR_RO(_name) + #endif /* __KERNEL__ */ #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From c7206205d00ab375839bd6c7ddb247d600693c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Mar 2012 17:26:36 +0100 Subject: perf: Fix mmap_page capabilities and docs Complete the syscall-less self-profiling feature and address all complaints, namely: - capabilities, so we can detect what is actually available at runtime Add a capabilities field to perf_event_mmap_page to indicate what is actually available for use. - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending properly. - ABI documentation as to how all this stuff works. Also improve the documentation for the new features. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 ++++- include/linux/perf_event.h | 83 +++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 4 +- 3 files changed, 84 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 453ac9497574..4ef8104958ee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1622,6 +1622,9 @@ static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; + if (!x86_pmu.attr_rdpmc) + return 0; + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { idx -= X86_PMC_IDX_FIXED; idx |= 1 << 30; @@ -1706,14 +1709,19 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + userpg->cap_usr_time = 0; + userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; + userpg->cap_usr_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 57ae485e80fc..ca9ed4e6a286 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -299,18 +299,31 @@ struct perf_event_mmap_page { /* * Bits needed to read the hw events in user-space. * - * u32 seq; - * s64 count; + * u32 seq, time_mult, time_shift, idx, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; * * do { * seq = pc->lock; - * * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * idx = pc->index; + * count = pc->offset; + * if (pc->cap_usr_rdpmc && idx) { + * width = pc->pmc_width; + * pmc = rdpmc(idx - 1); + * } * * barrier(); * } while (pc->lock != seq); @@ -323,14 +336,57 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ - __u32 time_mult, time_shift; + union { + __u64 capabilities; + __u64 cap_usr_time : 1, + cap_usr_rdpmc : 1, + cap_____res : 62; + }; + + /* + * If cap_usr_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & ((1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if idx), improving the scaling: + * + * enabled += delta; + * if (idx) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[121]; /* align to 1k */ + __u64 __reserved[120]; /* align to 1k */ /* * Control data for the mmap() data buffer. @@ -347,6 +403,13 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; +/* + * Build time assertion that we keep the data_head at the intended location. + * IOW, validation we got the __reserved[] size right. + */ +extern char __assert_mmap_data_head_offset + [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)]; + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index c61234b1a988..dc3b05272511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } -void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - perf_update_user_clock(userpg, now); + arch_perf_update_userpage(userpg, now); barrier(); ++userpg->lock; -- cgit v1.2.3 From 307b1cd7ecd7f3dc5ce3d3860957f034f0abe4df Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 23 Mar 2012 15:02:03 -0700 Subject: bitops: rename for_each_set_bit_cont() in favor of analogous list.h function This renames for_each_set_bit_cont() to for_each_set_bit_from() because it is analogous to list_for_each_entry_from() in list.h rather than list_for_each_entry_continue(). This doesn't remove for_each_set_bit_cont() for now. Signed-off-by: Akinobu Mita Cc: Robert Richter Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- include/linux/bitops.h | 5 ++++- tools/perf/util/include/linux/bitops.h | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event.c') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16cb58d..fa2900c0e398 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) /* Prefer fixed purpose counters */ if (x86_pmu.num_counters_fixed) { idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 94300fe46cce..a78e358f0c17 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -27,11 +27,14 @@ extern unsigned long __sw_hweight64(__u64 w); (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +#define for_each_set_bit_cont(bit, addr, size) \ + for_each_set_bit_from(bit, addr, size) + static __inline__ int get_bitmask_order(unsigned int count) { int order; diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index 62cdee78db7b..f1584833bd22 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -15,7 +15,7 @@ (bit) = find_next_bit((addr), (size), (bit) + 1)) /* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_cont(bit, addr, size) \ +#define for_each_set_bit_from(bit, addr, size) \ for ((bit) = find_next_bit((addr), (size), (bit)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) -- cgit v1.2.3