Diffstat (limited to 'arch'): 43 files changed, 2371 insertions(+), 37 deletions(-)
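One detail of the patch below that is easy to miss: in the x86 asm/perfmon_kern.h added by this series, pfm_arch_write_pmd sets the unimplemented upper bits of the 64-bit virtual counter value and clears the reserved bits before the wrmsrl (for 64-bit-emulated counters, PFM_REG_C64), so a value of -p lands in the hardware counter as 2^48 - p and the PMU overflow interrupt fires after p events. The standalone sketch below is illustrative only and not part of the patch; it assumes the 48-bit counters implied by PFM_AMD64_CTR_RSVD, and the period of 100000 events is a made-up value for the example.

/*
 * Illustrative sketch only -- not part of the patch.
 * Shows the net effect of the (value | ~ovfl_mask) & ~rsvd_msk step in
 * pfm_arch_write_pmd, assuming 48-bit hardware counters (PFM_AMD64_CTR_RSVD).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t hw_bits   = 48;                    /* bits 48-63 are reserved on AMD64 counters */
	const uint64_t ovfl_mask = (1ULL << hw_bits) - 1; /* implemented counter bits */

	uint64_t period = 100000;        /* hypothetical: desired events per PMU interrupt */
	uint64_t virt   = 0 - period;    /* 64-bit "virtual" PMD value, i.e. -period */

	/* what ends up in the hardware counter after masking to implemented bits */
	uint64_t hw = virt & ovfl_mask;

	printf("hw counter starts at 0x%012llx\n", (unsigned long long)hw);
	printf("events until wrap/interrupt: %llu\n",
	       (unsigned long long)(ovfl_mask + 1 - hw));
	return 0;
}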
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 6bd91ed7cd03..ad604df6a2b6 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT config IA64_MCA_RECOVERY tristate "MCA recovery from errors other than TLB." -config PERFMON +config PERFMON_V20 bool "Performance monitor support" help Selects whether support for the IA-64 performance monitor hardware diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig index 6dd8655664f3..2c04fbe6c414 100644 --- a/arch/ia64/configs/bigsur_defconfig +++ b/arch/ia64/configs/bigsur_defconfig @@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y # CONFIG_IA64_MCA_RECOVERY is not set -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index e05f9e1d3faa..7d89a19fc8b3 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_MC_ERR_INJECT is not set CONFIG_SGI_SN=y diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig index e86fbd39c795..5f8c7721e29a 100644 --- a/arch/ia64/configs/gensparse_defconfig +++ b/arch/ia64/configs/gensparse_defconfig @@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y CONFIG_SGI_SN=y diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig index 546a772f438e..d51457af7ca6 100644 --- a/arch/ia64/configs/sim_defconfig +++ b/arch/ia64/configs/sim_defconfig @@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y # CONFIG_IA64_MCA_RECOVERY is not set -# CONFIG_PERFMON is not set +# CONFIG_PERFMON_V20 is not set CONFIG_IA64_PALINFO=m # diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index c522edf23c62..318d846ab253 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y CONFIG_HOLES_IN_ZONE=y # CONFIG_IA32_SUPPORT is not set CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_MC_ERR_INJECT is not set # CONFIG_IA64_ESI is not set diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig index 0a06b1333c95..2bf0ad40398f 100644 --- a/arch/ia64/configs/zx1_defconfig +++ b/arch/ia64/configs/zx1_defconfig @@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_ESI is not set # CONFIG_KEXEC is not set diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index f88fa054d01d..3ecf7e0b44cb 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -321,7 +321,7 @@ struct thread_struct { #else # define INIT_THREAD_IA32 #endif /* CONFIG_IA32_SUPPORT */ -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 void *pfm_context; /* pointer to detailed PMU context */ unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ # define INIT_THREAD_PM .pfm_context = NULL, \ diff --git 
a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h index 927a381c20ca..387e54030af1 100644 --- a/arch/ia64/include/asm/system.h +++ b/arch/ia64/include/asm/system.h @@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct # define IA64_ACCOUNT_ON_SWITCH(p,n) #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 DECLARE_PER_CPU(unsigned long, pfm_syst_info); # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) #else diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea954892..93819cca7d96 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o +obj-$(CONFIG_PERFMON_V20) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 28d3d483db92..db54bd497cf6 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -40,7 +40,7 @@ #include <asm/system.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 # include <asm/perfmon.h> #endif @@ -660,7 +660,7 @@ init_IRQ (void) } #endif #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 pfm_init_percpu(); #endif platform_irq_init(); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 0e499757309b..5f6efcfa2de4 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -52,7 +52,7 @@ #include <asm/uaccess.h> #include <asm/delay.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* * perfmon context state */ @@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs) * the psr bits are already set properly in copy_threads() */ } -#else /* !CONFIG_PERFMON */ +#else /* !CONFIG_PERFMON_v20 */ asmlinkage long sys_perfmonctl (int fd, int cmd, void *arg, int count) { return -ENOSYS; } -#endif /* CONFIG_PERFMON */ +#endif /* CONFIG_PERFMON_V20 */ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..afbf1a8205ee 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -46,7 +46,7 @@ #include "entry.h" -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 # include <asm/perfmon.h> #endif @@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) return; } -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if (current->thread.pfm_needs_checking) /* * Note: pfm_handle_work() allow us to call it with interrupts @@ -334,14 +334,14 @@ cpu_idle (void) void ia64_save_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 unsigned long info; #endif if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_save_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) pfm_save_regs(task); @@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task) void ia64_load_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 unsigned long info; #endif if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_load_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) 
pfm_load_regs(task); @@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags, } #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if (current->thread.pfm_context) pfm_inherit(p, child_ptregs); #endif @@ -735,7 +735,7 @@ exit_thread (void) { ia64_drop_fpu(current); -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* if needed, stop monitoring and flush state to perfmon context */ if (current->thread.pfm_context) pfm_exit_thread(current); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 92c9689b7d97..ffd212fd2d36 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -31,7 +31,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/unwind.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 #include <asm/perfmon.h> #endif @@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr, "address 0x%lx\n", addr); return -1; } -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* * Check if debug registers are used by perfmon. This * test must be done once we know that we can do the diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 1dcbb85fc4ee..f865315a9248 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -381,7 +381,7 @@ smp_callin (void) extern void ia64_init_itm(void); extern volatile int time_keeper_id; -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 extern void pfm_init_percpu(void); #endif @@ -411,7 +411,7 @@ smp_callin (void) ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 pfm_init_percpu(); #endif diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile index 98771e2a78af..754f4153123e 100644 --- a/arch/ia64/lib/Makefile +++ b/arch/ia64/lib/Makefile @@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o -lib-$(CONFIG_PERFMON) += carta_random.o +lib-$(CONFIG_PERFMON_V20) += carta_random.o AFLAGS___divdi3.o = AFLAGS___udivdi3.o = -DUNSIGNED diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile index aad27a718ee0..3323fd5a46e9 100644 --- a/arch/ia64/oprofile/Makefile +++ b/arch/ia64/oprofile/Makefile @@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_PERFMON) += perfmon.o +oprofile-$(CONFIG_PERFMON_V20) += perfmon.o diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c index 31b545c35460..9ed2bc152fba 100644 --- a/arch/ia64/oprofile/init.c +++ b/arch/ia64/oprofile/init.c @@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) { int ret = -ENODEV; -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* perfmon_init() can fail, but we have no way to report it */ ret = perfmon_init(ops); #endif @@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) void oprofile_arch_exit(void) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 perfmon_exit(); #endif } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b5e714373385..cdc53491c033 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. 
+source "arch/x86/perfmon/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Makefile b/arch/x86/Makefile index cf72b569db41..f3af2b0b4f15 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ core-y += arch/x86/kernel/ core-y += arch/x86/mm/ +# perfmon support +core-$(CONFIG_PERFMON) += arch/x86/perfmon/ + # Remaining sub architecture files core-y += $(mcore-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 256b00b61892..891af3e6b3a6 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -826,4 +826,9 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_pfm_create + .quad sys_pfm_write + .quad sys_pfm_read /* 335 */ + .quad sys_pfm_attach + .quad sys_pfm_set_state ia32_syscall_end: diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa5..15d495f73485 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += perfmon.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f941..0ba6dd3aa24e 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -87,6 +87,11 @@ #define LOCAL_TIMER_VECTOR 0xef /* + * Perfmon PMU interrupt vector + */ +#define LOCAL_PERFMON_VECTOR 0xee + +/* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h index 6b1add8e31dd..e940722dc1f0 100644 --- a/arch/x86/include/asm/mach-default/entry_arch.h +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_PERFMON +BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h new file mode 100644 index 000000000000..906f4b24cf0c --- /dev/null +++ b/arch/x86/include/asm/perfmon.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * This file contains i386/x86_64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON__H_ +#define _ASM_X86_PERFMON__H_ + +/* + * arch-specific user visible interface definitions + */ + +#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ +#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ + +#endif /* _ASM_X86_PERFMON_H_ */ diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h new file mode 100644 index 000000000000..7cadbb894e83 --- /dev/null +++ b/arch/x86/include/asm/perfmon_kern.h @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This file contains X86 Processor Family specific definitions + * for the perfmon interface. This covers P6, Pentium M, P4/Xeon + * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON_KERN_H_ +#define _ASM_X86_PERFMON_KERN_H_ + +#ifdef CONFIG_PERFMON +#include <linux/unistd.h> +#ifdef CONFIG_4KSTACKS +#define PFM_ARCH_STK_ARG 8 +#else +#define PFM_ARCH_STK_ARG 16 +#endif + +struct pfm_arch_pmu_info { + u32 flags; /* PMU feature flags */ + /* + * mandatory model-specific callbacks + */ + int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set); + int (*has_ovfls)(struct pfm_context *ctx); + void (*quiesce)(void); + + /* + * optional model-specific callbacks + */ + void (*acquire_pmu_percpu)(void); + void (*release_pmu_percpu)(void); + int (*load_context)(struct pfm_context *ctx); + void (*unload_context)(struct pfm_context *ctx); +}; + +/* + * PMU feature flags + */ +#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */ +#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */ + +struct pfm_x86_ctx_flags { + unsigned int insecure:1; /* rdpmc per-thread self-monitoring */ + unsigned int reserved:31; /* for future use */ +}; + +struct pfm_arch_context { + u64 saved_real_iip; /* instr pointer of last NMI intr */ + struct pfm_x86_ctx_flags flags; /* flags */ + int saved_started; +}; + +/* + * functions implemented as inline on x86 + */ + +/** + * pfm_arch_write_pmc - write a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * @value: PMC 64-bit value + * + * in certain situations, ctx may be NULL + */ +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)", + 
pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) value); + + wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_write_pmd - write a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * @value: PMD 64-bit value + */ +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * to make sure the counter overflows, we set the + * upper bits. we also clear any other unimplemented + * bits as this may cause crash on some processors. + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value = (value | ~pfm_pmu_conf->ovfl_mask) + & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; + + PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) value); + + wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_read_pmd - read a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + + rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_read_pmc - read a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + + rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx", + pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_is_active - return non-zero is monitoring has been started + * @ctx: context to check + * + * At certain points, perfmon needs to know if monitoring has been + * explicitly started. + * + * On x86, there is not other way but to use pfm_start/pfm_stop + * to activate monitoring, thus we can simply check flags.started + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + + +/** + * pfm_arch_unload_context - detach context from thread or CPU + * @ctx: context to detach + * + * in system-wide ctx->task is NULL, otherwise it points to the + * attached thread + */ +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + if (ctx_arch->flags.insecure) { + PFM_DBG("clear cr4.pce"); + clear_in_cr4(X86_CR4_PCE); + } + + if (pmu_info->unload_context) + pmu_info->unload_context(ctx); +} + +/** + * pfm_arch_load_context - attach context to thread or CPU + * @ctx: context to attach + */ +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * RDPMC authorized in system-wide and + * per-thread self-monitoring. + * + * RDPMC only gives access to counts. + * + * The context-switch routine code does not restore + * all the PMD registers (optimization), thus there + * is a possible leak of counts there in per-thread + * mode. 
+ */ + if (ctx->task == current) { + PFM_DBG("set cr4.pce"); + set_in_cr4(X86_CR4_PCE); + ctx_arch->flags.insecure = 1; + } + + if (pmu_info->load_context) + ret = pmu_info->load_context(ctx); + + return ret; +} + +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); + +/** + * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt + * @ctx: current context + * @set: current event set + * + * called from __pfm_interrupt_handler(). + * ctx is not NULL. ctx is locked. interrupts are masked + * + * The following actions must take place: + * - stop all monitoring to ensure handler has consistent view. + * - collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + /* + * on X86, freezing is equivalent to stopping + */ + pfm_arch_stop(current, ctx); + + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_arch_restore_pmcs() + */ + ctx_arch->saved_started = ctx->flags.started; + ctx->flags.started = 0; +} + +/** + * pfm_arch_intr_unfreeze_pmu - conditionally reactive monitoring + * @ctx: current context + * + * current context may be not when dealing when spurious interrupts + * + * Must re-activate monitoring if context is not MASKED. + * interrupts are masked. + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + if (ctx == NULL) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + PFM_DBG_ovfl("state=%d", ctx->state); + + /* + * restore flags.started which is cleared in + * pfm_arch_intr_freeze_pmu() + */ + ctx->flags.started = ctx_arch->saved_started; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/** + * pfm_arch_ovfl_reset_pmd - reset pmd on overflow + * @ctx: current context + * @cnum: PMD index + * + * On some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + * + * For x86, the current version loses whatever is remaining in the counter, + * which is usually has a small count. In order not to loose this count, + * we do a read-modify-write to set the upper bits while preserving the + * low-order bits. This is slow but works. 
+ */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + pfm_arch_write_pmd(ctx, cnum, val); +} + +/** + * pfm_arch_context_create - create context + * @ctx: newly created context + * @flags: context flags as passed by user + * + * called from __pfm_create_context() + */ +static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + return 0; +} + +/** + * pfm_arch_context_free - free context + * @ctx: context to free + */ +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +/* + * functions implemented in arch/x86/perfmon/perfmon.c + */ +int pfm_arch_init(void); +void pfm_arch_resend_irq(struct pfm_context *ctx); + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); +void pfm_arch_pmu_config_remove(void); +char *pfm_arch_get_pmu_module_name(void); +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds); +void pfm_arch_pmu_release(void); + +static inline void pfm_arch_serialize(void) +{} + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) +/* + * x86 does not need extra alignment requirements for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 + +asmlinkage void pmu_interrupt(void); + +static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits) +{ + bitmap_copy((unsigned long *)a, + (unsigned long *)b, + nbits); +} + +static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits) +{ + bitmap_or((unsigned long *)a, + (unsigned long *)b, + (unsigned long *)c, + nbits); +} + +static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits) +{ + bitmap_and((unsigned long *)a, + (unsigned long *)b, + (unsigned long *)c, + nbits); +} + + +static inline void pfm_arch_bv_zero(u64 *a, int nbits) +{ + bitmap_zero((unsigned long *)a, nbits); +} + +static inline int pfm_arch_bv_weight(u64 *a, int nbits) +{ + return bitmap_weight((unsigned long *)a, nbits); +} + +static inline void pfm_arch_bv_set_bit(int b, u64 *a) +{ + __set_bit(b, (unsigned long *)a); +} + +static inline void pfm_arch_bv_clear_bit(int b, u64 *a) +{ + __clear_bit(b, (unsigned long *)a); +} + +static inline int pfm_arch_bv_test_bit(int b, u64 *a) +{ + return test_bit(b, (unsigned long *)a); +} + +static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr, + unsigned long size, + unsigned long offset) +{ + return find_next_bit((unsigned long *)addr, + size, + offset); +} +#endif /* CONFIG_PEFMON */ + +#endif /* _ASM_X86_PERFMON_KERN_H_ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..0ddd534bef44 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -79,6 +79,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit 
process */ @@ -92,6 +93,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ +#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -114,6 +116,8 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) +#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -135,12 +139,12 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERFMON_WORK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + _TIF_NOTSC|_TIF_PERFMON_CTXSW) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a4..06908451002f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,11 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_pfm_create 333 +#define __NR_pfm_write (__NR_pfm_create+1) +#define __NR_pfm_read (__NR_pfm_create+2) +#define __NR_pfm_attach (__NR_pfm_create+3) +#define __NR_pfm_set_state (__NR_pfm_create+4) #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 834b2c1d89fb..a42bb5eb9edb 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) - +#define __NR_pfm_create 295 +__SYSCALL(__NR_pfm_create, sys_pfm_create) +#define __NR_pfm_write (__NR_pfm_create+1) +__SYSCALL(__NR_pfm_write, sys_pfm_write) +#define __NR_pfm_read (__NR_pfm_create+2) + __SYSCALL(__NR_pfm_read, sys_pfm_read) +#define __NR_pfm_attach (__NR_pfm_create+3) +__SYSCALL(__NR_pfm_attach, sys_pfm_attach) +#define __NR_pfm_set_state (__NR_pfm_create+4) +__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c97..9f8826f33032 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -513,7 +513,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx jz work_notifysig work_resched: call schedule diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 983d85aeccce..1d9bef0797d9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -876,7 +876,13 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + +#ifdef CONFIG_PERFMON +ENTRY(pmu_interrupt) + apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt +END(pmu_interrupt) +#endif + /* * Exception entry points. 
*/ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff0235391285..24a0140e6c36 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -11,6 +11,7 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/bitops.h> +#include <linux/perfmon_kern.h> #include <asm/acpi.h> #include <asm/atomic.h> @@ -224,6 +225,10 @@ void __init native_init_IRQ(void) apic_intr_init(); +#ifdef CONFIG_PERFMON + alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d45..7ff71d4d6d9b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> +#include <linux/perfmon_kern.h> #include <linux/prctl.h> #include <linux/dmi.h> @@ -258,6 +259,7 @@ void exit_thread(void) ds_free(current->thread.ds_ctx); } #endif /* CONFIG_X86_DS */ + pfm_exit_thread(); } void flush_thread(void) @@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(gs, p->thread.gs); + pfm_copy_thread(p); + tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { set_debugreg(next->debugreg0, 0); set_debugreg(next->debugreg1, 1); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3180e79c3697..86099f98104a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/tick.h> +#include <linux/perfmon_kern.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -255,6 +256,7 @@ void exit_thread(void) ds_free(t->ds_ctx); } #endif /* CONFIG_X86_DS */ + pfm_exit_thread(); } void flush_thread(void) @@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + pfm_copy_thread(p); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, prev = &prev_p->thread, next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = prev->debugctlmsr; #ifdef CONFIG_X86_DS @@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 27a5c8174322..7d6fc603dea7 100644 --- a/arch/x86/kernel/signal_32.c +++ 
b/arch/x86/kernel/signal_32.c @@ -19,6 +19,7 @@ #include <linux/wait.h> #include <linux/tracehook.h> #include <linux/elf.h> +#include <linux/perfmon_kern.h> #include <linux/smp.h> #include <linux/mm.h> @@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_user(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index d2307e41fbdb..24e389836fc0 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -21,6 +21,7 @@ #include <linux/personality.h> #include <linux/compiler.h> #include <linux/uaccess.h> +#include <linux/perfmon_kern.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_user(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c3..81c22739f70b 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,8 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_pfm_create + .long sys_pfm_write + .long sys_pfm_read /* 335 */ + .long sys_pfm_attach + .long sys_pfm_set_state diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 022cd41ea9b4..584a9ef4e44c 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -17,6 +17,7 @@ #include <linux/moduleparam.h> #include <linux/kdebug.h> #include <linux/cpu.h> +#include <linux/perfmon_kern.h> #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> @@ -142,12 +143,18 @@ static int nmi_setup(void) int err = 0; int cpu; - if (!allocate_msrs()) + if (pfm_session_allcpus_acquire()) + return -EBUSY; + + if (!allocate_msrs()) { + pfm_session_allcpus_release(); return -ENOMEM; + } err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + pfm_session_allcpus_release(); return err; } @@ -228,6 +235,7 @@ static void nmi_shutdown(void) msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); + pfm_session_allcpus_release(); put_cpu_var(cpu_msrs); } diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig new file mode 100644 index 000000000000..8144d1d0d600 --- /dev/null +++ b/arch/x86/perfmon/Kconfig @@ -0,0 +1,33 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + select X86_LOCAL_APIC + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See <http://perfmon2.sf.net/> for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config X86_PERFMON_INTEL_ARCH + bool "Support for Intel architectural perfmon v1/v2/v3" + depends on PERFMON + default n + help + Enables support for Intel architectural performance counters. 
+ This feature was introduced with Intel Core Solo/Core Duo processors. + +config X86_PERFMON_AMD64 + bool "Support AMD Athlon/Opteron hardware performance counters" + depends on PERFMON + default n + help + Enables support for Athlon/Opterton hardware performance counters. + Support for family 6, 15 and 16 processors. + endmenu diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile new file mode 100644 index 000000000000..c0a4ca0da329 --- /dev/null +++ b/arch/x86/perfmon/Makefile @@ -0,0 +1,7 @@ +# +# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian <eranian@hpl.hp.com> +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o +obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c new file mode 100644 index 000000000000..844f19dc6cb0 --- /dev/null +++ b/arch/x86/perfmon/perfmon.c @@ -0,0 +1,619 @@ +/* + * This file implements the X86 specific support for the perfmon2 interface + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/interrupt.h> +#include <linux/perfmon_kern.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/nmi.h> + +#include <asm/apic.h> + +DEFINE_PER_CPU(unsigned long, real_iip); +DEFINE_PER_CPU(int, pfm_using_nmi); + +/** + * pfm_arch_ctxswin_thread - thread context switch in + * @task: task switched in + * @ctx: context for the task + * @set: active event set + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * + * Caller has already restored all PMD and PMC registers, if + * necessary (i.e., lazy restore scheme). + * + * On x86, the only common code just needs to unsecure RDPMC if necessary + * + * On model-specific features, e.g., PEBS, IBS, are taken care of in the + * corresponding PMU description module + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * restore saved real iip + */ + if (ctx->active_set->npend_ovfls) + __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; + + /* + * enable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + set_in_cr4(X86_CR4_PCE); +} + +/** + * pfm_arch_ctxswout_thread - context switch out thread + * @task: task switched out + * @ctx : context switched out + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
+ * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_arch_pmu_info *pmu_info; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * disable lazy restore of PMCS on ctxswin because + * we modify some of them. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + if (ctx->active_set->npend_ovfls) + ctx_arch->saved_real_iip = __get_cpu_var(real_iip); + + /* + * disable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + clear_in_cr4(X86_CR4_PCE); + + return pmu_info->stop_save(ctx, ctx->active_set); +} + +/** + * pfm_arch_stop - deactivate monitoring + * @task: task to stop + * @ctx: context to stop + * + * Called from pfm_stop() + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + if (task != current) + return; + + pmu_info->stop_save(ctx, ctx->active_set); +} + + +/** + * pfm_arch_start - activate monitoring + * @task: task to start + * @ctx: context to stop + * + * Interrupts are masked. Context is locked. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * cannot restore PMC if no access to PMU. Will be done + * when the thread is switched back in + */ + if (task != current) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/** + * pfm_arch_restore_pmds - reload PMD registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_context_load(), pfm_ctxsw() + * + * Context is locked. Interrupts are masked. Set cannot be NULL. + * Access to the PMU is guaranteed. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u16 i, num; + + num = set->nused_pmds; + + /* + * we can restore only the PMD we use because: + * + * - can only read with pfm_read_pmds() the registers + * declared used via pfm_write_pmds() + * + * - if cr4.pce=1, only counters are exposed to user. RDPMC + * does not work with other types of PMU registers.Thus, no + * address is ever exposed by counters + * + * - there is never a dependency between one pmd register and + * another + */ + for (i = 0; num; i++) { + if (likely(pfm_arch_bv_test_bit(i, set->used_pmds))) { + pfm_write_pmd(ctx, i, set->pmds[i]); + num--; + } + } +} + +/** + * pfm_arch_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_context_load(), pfm_ctxsw(). + * + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. 
+ * + * function must restore all PMC registers from set + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u16 i, num; + + /* + * we need to restore PMCs only when: + * - context is not masked + * - monitoring activated + * + * Masking monitoring after an overflow does not change the + * value of flags.started + */ + if (!ctx->flags.started) + return; + + /* + * restore all pmcs + * + * It is not possible to restore only the pmcs we used because + * certain PMU models (e.g. Pentium 4) have dependencies. Thus + * we do not want one application using stale PMCs coming from + * another one. + * + * On PMU models where there is no dependencies between PMCs, then + * it is possible to optimize by only restoring the registers that + * are used, but this has to be done by model-specific code. + */ + num = ctx->regs.num_pmcs; + for (i = 0; num; i++) { + if (pfm_arch_bv_test_bit(i, ctx->regs.pmcs)) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } +} + +/** + * smp_pmu_interrupt - lowest level PMU interrupt handler for X86 + * @regs: machine state + * + * The PMU interrupt is handled through an interrupt gate, therefore + * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts. + * + * The perfmon interrupt handler MUST run with interrupts disabled due + * to possible race with other, higher priority interrupts, such as timer + * or IPI function calls. + * + * See description in IA-32 architecture manual, Vol 3 section 5.8.1 + */ +void smp_pmu_interrupt(struct pt_regs *regs) +{ + unsigned long iip; + int using_nmi; + + using_nmi = __get_cpu_var(pfm_using_nmi); + + ack_APIC_irq(); + + irq_enter(); + + /* + * when using NMI, pfm_handle_nmi() gets called + * first. It stops monitoring and record the + * iip into real_iip, then it repost the interrupt + * using the lower priority vector LOCAL_PERFMON_VECTOR + * + * On some processors, e.g., P4, it may be that some + * state is already recorded from pfm_handle_nmi() + * and it only needs to be copied back into the normal + * fields so it can be used transparently by higher level + * code. + */ + if (using_nmi) + iip = __get_cpu_var(real_iip); + else + iip = instruction_pointer(regs); + + pfm_interrupt_handler(iip, regs); + + /* + * On Intel processors: + * - it is necessary to clear the MASK field for the LVTPC + * vector. Otherwise interrupts remain masked. See + * section 8.5.1 + * AMD X86-64: + * - the documentation does not stipulate the behavior but + * it seems to work without the write, so we skip + */ + if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR); + + irq_exit(); +} + +/** + * pfm_handle_nmi - PMU NMI handler notifier callback + * @nb ; notifier block + * @val: type of die notifier + * @data: die notifier-specific data + * + * called from notify_die() notifier from an trap handler path. We only + * care about NMI related callbacks, and ignore everything else. + * + * Cannot grab any locks, include the perfmon context lock + * + * Must detect if NMI interrupt comes from perfmon, and if so it must + * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt + * handler needs to grab the context lock, thus is cannot be run directly + * from the NMI interrupt call path. 
+ */ +static int __kprobes pfm_handle_nmi(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pfm_context *ctx; + struct pfm_arch_pmu_info *pmu_info; + + /* + * only NMI related calls + */ + if (val != DIE_NMI_IPI) + return NOTIFY_DONE; + + /* + * perfmon not using NMI + */ + if (!__get_cpu_var(pfm_using_nmi)) + return NOTIFY_DONE; + + /* + * No context + */ + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) { + PFM_DBG_ovfl("no ctx"); + return NOTIFY_DONE; + } + + /* + * Detect if we have overflows, i.e., NMI interrupt + * caused by PMU + */ + pmu_info = pfm_pmu_info(); + if (!pmu_info->has_ovfls(ctx)) { + PFM_DBG_ovfl("no ovfl"); + return NOTIFY_DONE; + } + + /* + * we stop the PMU to avoid further overflow before this + * one is treated by lower priority interrupt handler + */ + pmu_info->quiesce(); + + /* + * record actual instruction pointer + */ + __get_cpu_var(real_iip) = instruction_pointer(args->regs); + + /* + * post lower priority interrupt (LOCAL_PERFMON_VECTOR) + */ + pfm_arch_resend_irq(ctx); + + /* + * we need to rewrite the APIC vector on Intel + */ + if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * the notification was for us + */ + return NOTIFY_STOP; +} + +static struct notifier_block pfm_nmi_nb = { + .notifier_call = pfm_handle_nmi +}; + +/** + * pfm_arch_resend_irq - post perfmon interrupt on regular vector + * + * called from pfm_ctxswin_thread() and pfm_handle_nmi() + */ +void pfm_arch_resend_irq(struct pfm_context *ctx) +{ + unsigned long val, dest; + /* + * we cannot use hw_resend_irq() because it goes to + * the I/O APIC. We need to go to the Local APIC. + * + * The "int vec" is not the right solution either + * because it triggers a software intr. We need + * to regenerate the interrupt and have it pended + * until we unmask interrupts. + * + * Instead we send ourself an IPI on the perfmon + * vector. + */ + val = APIC_DEST_SELF|APIC_INT_ASSERT| + APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; + + dest = apic_read(APIC_ID); + apic_write(APIC_ICR2, dest); + apic_write(APIC_ICR, val); +} + +/** + * pfm_arch_pmu_acquire_percpu - setup APIC per CPU + * @data: contains pmu flags + */ +static void pfm_arch_pmu_acquire_percpu(void *data) +{ + struct pfm_arch_pmu_info *pmu_info; + unsigned int tmp, vec; + unsigned long flags = (unsigned long)data; + unsigned long lvtpc; + + pmu_info = pfm_pmu_conf->pmu_info; + /* + * we only reprogram the LVTPC vector if we have detected + * no sharing, otherwise it means the APIC is already programmed + * and we use whatever vector (likely NMI) is there + */ + if (!(flags & PFM_X86_FL_SHARING)) { + vec = LOCAL_PERFMON_VECTOR; + + tmp = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, vec); + apic_write(APIC_LVTERR, tmp); + } + lvtpc = (unsigned long)apic_read(APIC_LVTPC); + + __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI; + + PFM_DBG("LTVPC=0x%lx using_nmi=%d", + lvtpc, __get_cpu_var(pfm_using_nmi)); + /* + * invoke model specific acquire routine. 
+ */ + if (pmu_info->acquire_pmu_percpu) + pmu_info->acquire_pmu_percpu(); +} + +/** + * pfm_arch_pmu_acquire - acquire PMU resource from system + * @unavail_pmcs : bitmask to use to set unavailable pmcs + * @unavail_pmds : bitmask to use to set unavailable pmds + * + * interrupts are not masked + * + * Grab PMU registers from lower level MSR allocator + * + * Program the APIC according the possible interrupt vector + * either LOCAL_PERFMON_VECTOR or NMI + */ +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_regmap_desc *d; + u16 i, nlost; + + pmu_info = pfm_pmu_conf->pmu_info; + pmu_info->flags &= ~PFM_X86_FL_SHARING; + + nlost = 0; + + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + /* + * reserve register with lower-level allocator + */ + if (!reserve_evntsel_nmi(d->hw_addr)) { + PFM_DBG("pmc%d(%s) already used", i, d->desc); + pfm_arch_bv_set_bit(i, unavail_pmcs); + nlost++; + continue; + } + } + PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags); + /* + * some PMU models (e.g., P6) do not support sharing + * so check if we found less than the expected number of PMC registers + */ + if (nlost) { + if (pmu_info->flags & PFM_X86_FL_NO_SHARING) { + PFM_INFO("PMU already used by another subsystem, " + "PMU does not support sharing, " + "try disabling Oprofile or " + "reboot with nmi_watchdog=0"); + goto undo; + } + pmu_info->flags |= PFM_X86_FL_SHARING; + } + + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (!reserve_perfctr_nmi(d->hw_addr)) { + PFM_DBG("pmd%d(%s) already used", i, d->desc); + pfm_arch_bv_set_bit(i, unavail_pmds); + } + } + /* + * program APIC on each CPU + */ + on_each_cpu(pfm_arch_pmu_acquire_percpu, + (void *)(unsigned long)pmu_info->flags , 1); + + return 0; +undo: + /* + * must undo reservation of pmcs in case of error + */ + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + if (!pfm_arch_bv_test_bit(i, unavail_pmcs)) + release_evntsel_nmi(d->hw_addr); + } + return -EBUSY; +} + +/** + * pfm-arch_pmu_release_percpu - clear NMI state for one CPU + * + */ +static void pfm_arch_pmu_release_percpu(void *data) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_conf->pmu_info; + + __get_cpu_var(pfm_using_nmi) = 0; + /* + * invoke model specific release routine. 
+ */ + if (pmu_info->release_pmu_percpu) + pmu_info->release_pmu_percpu(); +} + +/** + * pfm_arch_pmu_release - release PMU resource to system + * + * called from pfm_pmu_release() + * interrupts are not masked + * + * On x86, we return the PMU registers to the MSR allocator + */ +void pfm_arch_pmu_release(void) +{ + struct pfm_regmap_desc *d; + u16 i, n; + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs_all.num_pmcs; + for (i = 0; n; i++, d++) { + if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) + continue; + release_evntsel_nmi(d->hw_addr); + n--; + PFM_DBG("pmc%u released", i); + } + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs_all.num_pmds; + for (i = 0; n; i++, d++) { + if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds)) + continue; + release_perfctr_nmi(d->hw_addr); + n--; + PFM_DBG("pmd%u released", i); + } + + /* clear NMI variable if used */ + if (__get_cpu_var(pfm_using_nmi)) + on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1); +} + +/** + * pfm_arch_init - one time global arch-specific initialization + * + * called from pfm_init() + */ +int __init pfm_arch_init(void) +{ + /* + * we need to register our NMI handler when the kernels boots + * to avoid a deadlock condition with the NMI watchdog or Oprofile + * if we were to try and register/unregister on-demand. + */ + register_die_notifier(&pfm_nmi_nb); + return 0; +} diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c new file mode 100644 index 000000000000..f078fe28137d --- /dev/null +++ b/arch/x86/perfmon/perfmon_amd64.c @@ -0,0 +1,483 @@ +/* + * This file contains the PMU description for the Athlon64 and Opteron64 + * processors. It supports 32 and 64-bit modes. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kprobes.h> +#include <linux/vmalloc.h> +#include <linux/topology.h> +#include <linux/pci.h> +#include <linux/perfmon_kern.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> + +static void __kprobes pfm_amd64_quiesce(void); +static int pfm_amd64_has_ovfls(struct pfm_context *ctx); +static int pfm_amd64_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +static struct pfm_arch_pmu_info pfm_amd64_pmu_info = { + .stop_save = pfm_amd64_stop_save, + .has_ovfls = pfm_amd64_has_ovfls, + .quiesce = pfm_amd64_quiesce, +}; + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +/* + * reserved bits must be 1 + * + * for family 15: + * - upper 32 bits are reserved + * - bit 20, bit 21 + * + * for family 16: + * - bits 36-39 are reserved + * - bits 42-63 are reserved + * - bit 20, bit 21 + * + */ +#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) +#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21)) + +static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), +/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), +/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), +}; +#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc) + +/* + * AMD64 counters are 48 bits, upper bits are reserved + */ +#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1)) + +#define PFM_AMD_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PERFCTR"#n, \ + .hw_addr = MSR_K7_PERFCTR0+n, \ + .rsvd_msk = PFM_AMD64_CTR_RSVD, \ + .dep_pmcs[0] = 1ULL << n \ + } + +static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = { +/* pmd0 */ PFM_AMD_D(0), +/* pmd1 */ PFM_AMD_D(1), +/* pmd2 */ PFM_AMD_D(2), +/* pmd3 */ PFM_AMD_D(3) +}; +#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc) + +static struct pfm_context *pfm_nb_task_owner; + +static struct pfm_pmu_config pfm_amd64_pmu_conf; + +/** + * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events + * @ctx: context to use + * + * There can only be one user per socket for the Northbridge (NB) events, + * so we enforce mutual exclusion as follows: + * - per-thread : only one context machine-wide can use NB events + * + * Exclusion is enforced at: + * - pfm_load_context() + * - pfm_write_pmcs() for attached contexts + * + * Exclusion is released at: + * - pfm_unload_context() or any calls that implicitely uses it + * + * return: + * 0 : successfully acquire NB access + * < 0: errno, failed to acquire NB access + */ +static int pfm_amd64_acquire_nb(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, NULL, ctx); + if (!old) { + PFM_DBG("acquired Northbridge event access globally"); + } else if (old != ctx) { + PFM_DBG("global NorthBridge event conflict"); + return -EBUSY; + } + return 0; +} + +/** + * 
pfm_amd64_pmc_write_check -- check validity of pmc writes + * @ctx: context to use + * @set: event set to use + * @req: user request to modify the pmc + * + * invoked from pfm_write_pmcs() when NB event access control is enabled, + * i.e., when we have detected a multi-core processor. + * + * context is locked, interrupts are masked + */ +static int pfm_amd64_pmc_write_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmr *req) +{ + unsigned int event; + + /* + * delay checking NB event until we load the context + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + /* + * check whether the event is an NB event + */ + event = (unsigned int)(req->reg_value & 0xff); + if (event < 0xee) + return 0; + + return pfm_amd64_acquire_nb(ctx); +} + +/** + * pfm_amd64_load_context - amd64 model-specific load callback + * @ctx: context to use + * + * invoked on pfm_load_context(). + * context is locked, interrupts are masked + */ +static int pfm_amd64_load_context(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, n; + + set = ctx->active_set; + n = set->nused_pmcs; + for (i = 0; n; i++) { + if (!pfm_arch_bv_test_bit(i, set->used_pmcs)) + continue; + + if ((set->pmcs[i] & 0xff) >= 0xee) + goto found; + n--; + } + return 0; +found: + return pfm_amd64_acquire_nb(ctx); +} + +/** + * pfm_amd64_unload_context -- amd64 model-specific unload callback + * @ctx: context to use + * + * invoked on pfm_unload_context() + */ +static void pfm_amd64_unload_context(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, ctx, NULL); + if (old == ctx) + PFM_DBG("released NorthBridge events globally"); +} + +/** + * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls + * + * detect if we need to activate NorthBridge event access control + */ +static int pfm_amd64_setup_nb_event_ctrl(void) +{ + unsigned int c, n = 0; + unsigned int max_phys = 0; + +#ifdef CONFIG_SMP + for_each_possible_cpu(c) { + if (cpu_data(c).phys_proc_id > max_phys) + max_phys = cpu_data(c).phys_proc_id; + } +#else + max_phys = 0; +#endif + if (max_phys > 255) { + PFM_INFO("socket id %u is too big to handle", max_phys); + return -ENOMEM; + } + + n = max_phys + 1; + if (n < 2) + return 0; + + pfm_nb_task_owner = NULL; + + /* + * activate write-checker for PMC registers + */ + for (c = 0; c < PFM_AMD_NUM_PMCS; c++) + pfm_amd64_pmc_desc[c].type |= PFM_REG_WC; + + pfm_amd64_pmu_info.load_context = pfm_amd64_load_context; + pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context; + + pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check; + + PFM_INFO("NorthBridge event access control enabled"); + + return 0; +} + +/** + * pfm_amd64_setup_registers -- initialize register table + * + * modify register table based on actual host CPU + */ +static void pfm_amd64_setup_registers(void) +{ + u16 i; + + pfm_arch_bv_set_bit(0, enable_mask); + pfm_arch_bv_set_bit(1, enable_mask); + pfm_arch_bv_set_bit(2, enable_mask); + pfm_arch_bv_set_bit(3, enable_mask); + max_enable = 3+1; + + /* + * adjust reserved bit fields for family 16 + */ + if (current_cpu_data.x86 == 16) { + for (i = 0; i < PFM_AMD_NUM_PMCS; i++) + if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD) + pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD; + } +} + +/** + * pfm_amd64_probe_pmu -- detect host PMU + */ +static int pfm_amd64_probe_pmu(void) +{ + if
(current_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -1; + + switch (current_cpu_data.x86) { + case 6: + case 15: + case 16: + PFM_INFO("found family=%d", current_cpu_data.x86); + break; + default: + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } + + if (current_cpu_data.x86_max_cores > 1 + && pfm_amd64_setup_nb_event_ctrl()) + return -1; + + pfm_amd64_setup_registers(); + + return 0; +} + +/** + * pfm_amd64_has_ovfls -- detect pending overflows + * @ctx: context to use + * + * detect if counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_regmap_desc *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + /* + * Check regular counters + */ + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = pfm_amd64_pmd_desc; + + for (i = 0; num; i++) { + if (pfm_arch_bv_test_bit(i, cnt_mask)) { + rdmsrl(xrd[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/** + * pfm_amd64_stop_save - stop monitoring, collect pending overflows + * @ctx: context to use + * @set: event set to stop + * + * interrupts are masked, PMU access guaranteed + */ +static int pfm_amd64_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_pmds; + u64 val, wmask, ovfl_mask; + u32 i, count; + + pmu_info = pfm_pmu_info(); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + pfm_arch_bv_and(used_mask, + set->used_pmcs, + enable_mask, + max_enable); + + count = pfm_arch_bv_weight(used_mask, max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, used_mask)) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already have a pending overflow condition, we simply + * return to take care of it first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts.
+ */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, set->used_pmds)) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) { + if (!(val & wmask)) { + pfm_arch_bv_set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + val = (set->pmds[i] & ~ovfl_mask) + | (val & ovfl_mask); + } + set->pmds[i] = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_amd64_quiesce -- stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon-related locks + */ +static void __kprobes pfm_amd64_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + if (pfm_arch_bv_test_bit(0, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0, 0); + if (pfm_arch_bv_test_bit(1, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+1, 0); + if (pfm_arch_bv_test_bit(2, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+2, 0); + if (pfm_arch_bv_test_bit(3, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+3, 0); +} + +static struct pfm_pmu_config pfm_amd64_pmu_conf = { + .pmu_name = "AMD64", + .counter_width = 47, + .pmd_desc = pfm_amd64_pmd_desc, + .pmc_desc = pfm_amd64_pmc_desc, + .num_pmc_entries = PFM_AMD_NUM_PMCS, + .num_pmd_entries = PFM_AMD_NUM_PMDS, + .version = "1.2", + .pmu_info = &pfm_amd64_pmu_info +}; + +static int __init pfm_amd64_pmu_init_module(void) +{ + if (pfm_amd64_probe_pmu()) + return -ENOSYS; + return pfm_pmu_register(&pfm_amd64_pmu_conf); +} + +device_initcall(pfm_amd64_pmu_init_module); diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c new file mode 100644 index 000000000000..ce4293dcfcda --- /dev/null +++ b/arch/x86/perfmon/perfmon_intel_arch.c @@ -0,0 +1,628 @@ +/* + * This file contains the Intel architectural perfmon v1, v2, v3 + * description tables. + * + * Architectural perfmon was introduced with Intel Core Solo/Duo + * processors. + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kprobes.h> +#include <linux/perfmon_kern.h> +#include <asm/msr.h> +#include <asm/apic.h> + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; +static int pfm_intel_arch_version; + +DEFINE_PER_CPU(u64, saved_global_ctrl); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_IA_PMC_VAL (1ULL<<20) +#define PFM_IA_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR : starts at 0x0c1 & occupy a contiguous block of MSR + * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR + * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR + */ +#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 +#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0 + +/* + * layout of EAX for CPUID.0xa leaf function + */ +struct pmu_eax { + unsigned int version:8; /* architectural perfmon version */ + unsigned int num_cnt:8; /* number of generic counters */ + unsigned int cnt_width:8; /* width of generic counters */ + unsigned int ebx_length:8; /* number of architected events */ +}; + +/* + * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected + */ +struct pmu_edx { + unsigned int num_cnt:5; /* number of fixed counters */ + unsigned int cnt_width:8; /* width of fixed counters */ + unsigned int reserved:19; +}; + +static void pfm_intel_arch_acquire_pmu_percpu(void); +static void pfm_intel_arch_release_pmu_percpu(void); +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx); +static void __kprobes pfm_intel_arch_quiesce(void); + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = { + .stop_save = pfm_intel_arch_stop_save, + .has_ovfls = pfm_intel_arch_has_ovfls, + .quiesce = pfm_intel_arch_quiesce, + .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu, + .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu +}; + +#define PFM_IA_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_IA_PMC_VAL, \ + .rsvd_msk = PFM_IA_PMC_RSVD, \ + .no_emul64_msk = PFM_IA_NO64, \ + .hw_addr = MSR_GEN_SEL_BASE+(n) \ + } + +#define PFM_IA_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .dep_pmcs[0] = 1ULL << n \ + } + +#define PFM_IA_FD(n) \ + { .type = PFM_REG_C, \ + .desc = "FIXED_CTR"#n, \ + .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ + .dep_pmcs[0] = 1ULL << 16 \ + } + + +static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = { +/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3), +/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7), +/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11), +/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15), + +/* pmc16 */ { .type = PFM_REG_I, + .desc = "FIXED_CTRL", + .dfl_val = 0x8888888888888888ULL, /* force PMI */ + .rsvd_msk = 0, /* set dynamically */ + .no_emul64_msk = 0, 
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +}; +#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) + +static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = { +/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3), +/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7), +/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11), +/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15), + +/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3), +/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7), +/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11), +/* pmd28 */ PFM_IA_FD(16), PFM_IA_FD(17), PFM_IA_FD(18), PFM_IA_FD(19) +}; +#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc) + +#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */ +#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */ +#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */ + +static struct pfm_pmu_config pfm_intel_arch_pmu_conf; + +static void pfm_intel_arch_check_errata(void) +{ + /* + * Core Duo errata AE49 (no fix). Both counters share a single + * enable bit in PERFEVTSEL0 + */ + if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING; +} + +static inline void set_enable_mask(unsigned int i) +{ + pfm_arch_bv_set_bit(i, enable_mask); + + /* max_enable = highest + 1 */ + if ((i+1) > max_enable) + max_enable = i+ 1; +} + +static void pfm_intel_arch_setup_generic(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd; + unsigned int i; + + /* + * first we handle the generic counters: + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + + /* + * min of number of Hw counters and hardcoded in the tables + */ + if (count >= PFM_IA_MAX_CNT) { + printk(KERN_INFO "perfmon: Limiting number of generic counters" + " to %u, HW supports %u", + PFM_IA_MAX_CNT, count); + count = PFM_IA_MAX_CNT; + } + + /* + * adjust rsvd_msk for generic counters based on actual width + * initialize enable_mask (1 per pmd) + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) { + pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd; + set_enable_mask(i); + } + + /* + * handle version 3 new anythread bit (21) + */ + if (version == 3) { + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21); + } + + + /* + * mark unused generic counters as not available + */ + for (i = count ; i < PFM_IA_MAX_CNT; i++) { + pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA; + pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA; + } +} + +static void pfm_intel_arch_setup_fixed(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd, dfl; + unsigned int i; + + /* + * handle the fixed counters (if any): + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + if (count >= PFM_IA_MAX_FCNT) { + printk(KERN_INFO "perfmon: Limiting number of fixed counters" + " to %u, HW supports %u", + PFM_IA_MAX_FCNT, count); + count = PFM_IA_MAX_FCNT; + } + /* + * adjust rsvd_msk for fixed counters based on 
actual width + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd; + + /* + * handle version 3 new anythread bit (bit 2) + */ + if (version == 3) + rsvd = 1ULL << 3; + else + rsvd = 3ULL << 2; + + pfm_intel_arch_pmc_desc[16].rsvd_msk = 0; + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2); + + /* + * mark unused fixed counters as unimplemented + * + * update the rsvd_msk, dfl_val in FIXED_CTRL: + * - rsvd_msk: set all 4 bits + * - dfl_val : clear all 4 bits + */ + dfl = pfm_intel_arch_pmc_desc[16].dfl_val; + rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk; + + for (i = count; i < PFM_IA_MAX_FCNT; i++) { + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA; + rsvd |= 0xfULL << (i<<2); + dfl &= ~(0xfULL << (i<<2)); + } + + /* + * FIXED_CTR_CTRL unavailable when no fixed counters are defined + */ + if (!count) { + pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA; + } else { + /* update rsvd_msk and dfl_val */ + pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd; + pfm_intel_arch_pmc_desc[16].dfl_val = dfl; + set_enable_mask(16); + } +} + +static int pfm_intel_arch_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + struct pmu_edx edx; + } eax, edx; + unsigned int ebx, ecx; + unsigned int width = 0; + + edx.val = 0; + + if (!cpu_has_arch_perfmon) { + PFM_INFO("no support for Intel architectural PMU"); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with the lapic option"); + return -1; + } + + /* cpuid() call protected by cpu_has_arch_perfmon */ + cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val); + + /* + * some 6/15 models have a buggy BIOS + */ + if (eax.eax.version == 0 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { + PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters"); + eax.eax.version = 2; + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 40; + } + + /* + * some v2 BIOSes are incomplete + */ + if (eax.eax.version == 2 && !edx.edx.num_cnt) { + PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + edx.edx.cnt_width = 40; + } + + /* + * no fixed counters on earlier versions + */ + if (eax.eax.version < 2) { + edx.val = 0; + } else { + /* + * use the min value of both widths until we support + * variable width counters + */ + width = eax.eax.cnt_width < edx.edx.cnt_width ? + eax.eax.cnt_width : edx.edx.cnt_width; + } + + /* + * Intel Atom processors have buggy firmware which does not report + * the correct number of fixed counters + */ + if (eax.eax.version == 3 && edx.edx.num_cnt < 3 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) { + PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + } + + PFM_INFO("detected architectural perfmon v%d", eax.eax.version); + PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d", + eax.eax.num_cnt, + eax.eax.cnt_width, + edx.edx.num_cnt, + edx.edx.cnt_width); + + pfm_intel_arch_setup_generic(eax.eax.version, + width, + eax.eax.num_cnt); + + pfm_intel_arch_setup_fixed(eax.eax.version, + width, + edx.edx.num_cnt); + + pfm_intel_arch_check_errata(); + + pfm_intel_arch_version = eax.eax.version; + + return 0; +} + +/** + * pfm_intel_arch_has_ovfls - check for pending overflow condition + * @ctx: context to work on + * + * detect if counters have overflowed.
+ * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx) +{ + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * we can leverage the fact that we know the mapping + * to hardcode the MSR address and avoid accessing + * more cachelines + * + * We need to check cnt_mask because not all registers + * may be available. + */ + for (i = 0; num; i++) { + if (pfm_arch_bv_test_bit(i, cnt_mask)) { + rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 used_mask[PFM_PMC_BV]; + u64 val, wmask, ovfl_mask; + u32 i, count; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + pfm_arch_bv_and(used_mask, + set->used_pmcs, + enable_mask, + max_enable); + + count = pfm_arch_bv_weight(used_mask, max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, used_mask)) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already have a pending overflow condition, we simply + * return to take care of it first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. + * + * all pmds are counters + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, set->used_pmds)) { + val = pfm_arch_read_pmd(ctx, i); + if (!(val & wmask)) { + pfm_arch_bv_set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + val = (set->pmds[i] & ~ovfl_mask) + | (val & ovfl_mask); + set->pmds[i] = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon-related locks + */ +static void __kprobes pfm_intel_arch_quiesce(void) +{ + u16 i; + + /* + * PMC16 is the fixed control register, so it has a + * distinct MSR address + * + * We do not use the hw_addr field in the table to avoid touching + * too many cachelines + */ + for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { + if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) { + if (i == 16) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + else + wrmsrl(MSR_P6_EVNTSEL0+i, 0); + } + } +} +/** +* pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU +* +* Since v2, there is a global control MSR to start/stop and +* also collect overflow status information. In particular, +* GLOBAL_CTRL controls start/stop and has one bit per counter. +* To maintain backward compatibility with v1, the power-on value +* of GLOBAL_CTRL should be such that generic counters are enabled +* but fixed counters are disabled (true on Penryn and Atom currently). +* +* Here, we simply make sure that all available counters are enabled. +* After that, start/stop is controlled on a per-counter basis.
+*/ +static void pfm_intel_arch_acquire_pmu_percpu(void) +{ + struct pfm_regmap_desc *d; + u64 mask = 0; + unsigned int i; + + /* nothing to do for v1 */ + if (pfm_intel_arch_version < 2) + return; + + /* + * build bitmask of registers that are available to + * us. In some cases, there may be fewer registers than + * what the PMU supports due to sharing with other kernel + * subsystems, such as NMI + */ + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < 16; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << i; + } + for (i = 16; i < PFM_IA_MAX_PMDS; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << (32+i-16); + } + /* + * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); + + PFM_DBG("global=0x%llx set to 0x%llx", + __get_cpu_var(saved_global_ctrl), + mask); + /* + * enable all registers + * + * No need to quiesce the PMU. If there is an overflow, it will be + * treated as spurious by the handler + */ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask); +} + +/** +* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU +* +* Since v2, there is a global control MSR to start/stop and +* also collect overflow status information. In particular, +* GLOBAL_CTRL controls start/stop and has one bit per counter. +* To maintain backward compatibility with v1, the power-on value +* of GLOBAL_CTRL should be such that generic counters are enabled +* but fixed counters are disabled (true on Penryn and Atom currently). +* +* Here, we are done using the PMU, so we restore the power-on value. +*/ +static void pfm_intel_arch_release_pmu_percpu(void) +{ + /* nothing to do for v1 */ + if (pfm_intel_arch_version < 2) + return; + + PFM_DBG("global_ctrl restored to 0x%llx\n", + __get_cpu_var(saved_global_ctrl)); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to due to the specification + * of wrmsr. Bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must + * not be set (see rsvd_msk for PMDs). As such, the effective width of a + * counter is 31 bits only, regardless of what CPUID.0xa returns. + * + * See the IA-32 Intel Architecture Software Developer's Manual Vol 3B, chapter 18 + */ +static struct pfm_pmu_config pfm_intel_arch_pmu_conf = { + .pmu_name = "Intel architectural", + .pmd_desc = pfm_intel_arch_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_IA_MAX_PMCS, + .num_pmd_entries = PFM_IA_MAX_PMDS, + .pmc_desc = pfm_intel_arch_pmc_desc, + .version = "1.0", + .pmu_info = &pfm_intel_arch_pmu_info +}; + +static int __init pfm_intel_arch_pmu_init_module(void) +{ + if (pfm_intel_arch_probe_pmu()) + return -ENOSYS; + + return pfm_pmu_register(&pfm_intel_arch_pmu_conf); +} + +device_initcall(pfm_intel_arch_pmu_init_module);
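For reference, the FIXED_CTR_CTRL handling above packs one 4-bit control field per fixed counter: pfm_intel_arch_setup_fixed() ORs the per-counter reserved bits into rsvd_msk at offset i*4 and clears the force-PMI default value for every unimplemented field. The standalone user-space sketch below is not part of the patch; the counter count and perfmon version are illustrative assumptions only, chosen to mirror that computation so the resulting dfl_val and rsvd_msk can be inspected.

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: mirrors how pfm_intel_arch_setup_fixed() derives the
 * dfl_val and rsvd_msk of the FIXED_CTR_CTRL register (pmc16 in the table).
 */
#define IA_MAX_FCNT	16			/* fixed counters hardcoded in the table */
#define FORCE_PMI_DFL	0x8888888888888888ULL	/* PMI bit set in every 4-bit field */

int main(void)
{
	unsigned int version = 2;	/* assumed: architectural perfmon v2 */
	unsigned int count = 3;		/* assumed: 3 fixed counters reported by CPUID.0xa */
	uint64_t dfl = FORCE_PMI_DFL;
	uint64_t rsvd = 0, per_ctr_rsvd;
	unsigned int i;

	/* v3 adds the anythread bit (bit 2), so only bit 3 remains reserved */
	per_ctr_rsvd = (version == 3) ? 1ULL << 3 : 3ULL << 2;

	/* each implemented fixed counter owns a 4-bit field at offset i*4 */
	for (i = 0; i < count; i++)
		rsvd |= per_ctr_rsvd << (i << 2);

	/* unimplemented fields: fully reserved, default value cleared */
	for (i = count; i < IA_MAX_FCNT; i++) {
		rsvd |= 0xfULL << (i << 2);
		dfl &= ~(0xfULL << (i << 2));
	}

	printf("FIXED_CTR_CTRL dfl_val  = 0x%016llx\n", (unsigned long long)dfl);
	printf("FIXED_CTR_CTRL rsvd_msk = 0x%016llx\n", (unsigned long long)rsvd);
	return 0;
}

With the assumed values (v2, 3 fixed counters) this prints dfl_val = 0x0000000000000888 and rsvd_msk = 0xfffffffffffffccc, i.e. PMI forced on for the three implemented counters and all other fields reserved, which is the same result the patch stores in pfm_intel_arch_pmc_desc[16].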