summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/Kconfig2
-rw-r--r--arch/ia64/configs/bigsur_defconfig2
-rw-r--r--arch/ia64/configs/generic_defconfig2
-rw-r--r--arch/ia64/configs/gensparse_defconfig2
-rw-r--r--arch/ia64/configs/sim_defconfig2
-rw-r--r--arch/ia64/configs/tiger_defconfig2
-rw-r--r--arch/ia64/configs/zx1_defconfig2
-rw-r--r--arch/ia64/include/asm/processor.h2
-rw-r--r--arch/ia64/include/asm/system.h2
-rw-r--r--arch/ia64/kernel/Makefile2
-rw-r--r--arch/ia64/kernel/irq_ia64.c4
-rw-r--r--arch/ia64/kernel/perfmon.c6
-rw-r--r--arch/ia64/kernel/process.c16
-rw-r--r--arch/ia64/kernel/ptrace.c4
-rw-r--r--arch/ia64/kernel/smpboot.c4
-rw-r--r--arch/ia64/lib/Makefile2
-rw-r--r--arch/ia64/oprofile/Makefile2
-rw-r--r--arch/ia64/oprofile/init.c4
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/Makefile3
-rw-r--r--arch/x86/ia32/ia32entry.S5
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/mach-default/entry_arch.h4
-rw-r--r--arch/x86/include/asm/perfmon.h34
-rw-r--r--arch/x86/include/asm/perfmon_kern.h438
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/unistd_32.h5
-rw-r--r--arch/x86/include/asm/unistd_64.h11
-rw-r--r--arch/x86/kernel/entry_32.S2
-rw-r--r--arch/x86/kernel/entry_64.S8
-rw-r--r--arch/x86/kernel/irqinit_64.c5
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--arch/x86/kernel/signal_32.c5
-rw-r--r--arch/x86/kernel/signal_64.c5
-rw-r--r--arch/x86/kernel/syscall_table_32.S5
-rw-r--r--arch/x86/oprofile/nmi_int.c10
-rw-r--r--arch/x86/perfmon/Kconfig33
-rw-r--r--arch/x86/perfmon/Makefile7
-rw-r--r--arch/x86/perfmon/perfmon.c619
-rw-r--r--arch/x86/perfmon/perfmon_amd64.c483
-rw-r--r--arch/x86/perfmon/perfmon_intel_arch.c628
43 files changed, 2371 insertions, 37 deletions
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..ad604df6a2b6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config IA64_MCA_RECOVERY
tristate "MCA recovery from errors other than TLB."
-config PERFMON
+config PERFMON_V20
bool "Performance monitor support"
help
Selects whether support for the IA-64 performance monitor hardware
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 6dd8655664f3..2c04fbe6c414 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
#
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index e05f9e1d3faa..7d89a19fc8b3 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index e86fbd39c795..5f8c7721e29a 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig
index 546a772f438e..d51457af7ca6 100644
--- a/arch/ia64/configs/sim_defconfig
+++ b/arch/ia64/configs/sim_defconfig
@@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-# CONFIG_PERFMON is not set
+# CONFIG_PERFMON_V20 is not set
CONFIG_IA64_PALINFO=m
#
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index c522edf23c62..318d846ab253 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y
CONFIG_HOLES_IN_ZONE=y
# CONFIG_IA32_SUPPORT is not set
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
# CONFIG_IA64_ESI is not set
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 0a06b1333c95..2bf0ad40398f 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_ESI is not set
# CONFIG_KEXEC is not set
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index f88fa054d01d..3ecf7e0b44cb 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -321,7 +321,7 @@ struct thread_struct {
#else
# define INIT_THREAD_IA32
#endif /* CONFIG_IA32_SUPPORT */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
void *pfm_context; /* pointer to detailed PMU context */
unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */
# define INIT_THREAD_PM .pfm_context = NULL, \
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 927a381c20ca..387e54030af1 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
# define IA64_ACCOUNT_ON_SWITCH(p,n)
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
DECLARE_PER_CPU(unsigned long, pfm_syst_info);
# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
#else
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea954892..93819cca7d96 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_NUMA) += numa.o
-obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
+obj-$(CONFIG_PERFMON_V20) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..db54bd497cf6 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -40,7 +40,7 @@
#include <asm/system.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -660,7 +660,7 @@ init_IRQ (void)
}
#endif
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
platform_irq_init();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309b..5f6efcfa2de4 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -52,7 +52,7 @@
#include <asm/uaccess.h>
#include <asm/delay.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* perfmon context state
*/
@@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
* the psr bits are already set properly in copy_threads()
*/
}
-#else /* !CONFIG_PERFMON */
+#else /* !CONFIG_PERFMON_V20 */
asmlinkage long
sys_perfmonctl (int fd, int cmd, void *arg, int count)
{
return -ENOSYS;
}
-#endif /* CONFIG_PERFMON */
+#endif /* CONFIG_PERFMON_V20 */
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index c57162705147..afbf1a8205ee 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -46,7 +46,7 @@
#include "entry.h"
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
return;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_needs_checking)
/*
* Note: pfm_handle_work() allow us to call it with interrupts
@@ -334,14 +334,14 @@ cpu_idle (void)
void
ia64_save_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_save_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_save_regs(task);
@@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task)
void
ia64_load_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_load_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_load_regs(task);
@@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags,
}
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_context)
pfm_inherit(p, child_ptregs);
#endif
@@ -735,7 +735,7 @@ exit_thread (void)
{
ia64_drop_fpu(current);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* if needed, stop monitoring and flush state to perfmon context */
if (current->thread.pfm_context)
pfm_exit_thread(current);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 92c9689b7d97..ffd212fd2d36 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -31,7 +31,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unwind.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
#include <asm/perfmon.h>
#endif
@@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr,
"address 0x%lx\n", addr);
return -1;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* Check if debug registers are used by perfmon. This
* test must be done once we know that we can do the
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1dcbb85fc4ee..f865315a9248 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -381,7 +381,7 @@ smp_callin (void)
extern void ia64_init_itm(void);
extern volatile int time_keeper_id;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
extern void pfm_init_percpu(void);
#endif
@@ -411,7 +411,7 @@ smp_callin (void)
ia64_mca_cmc_vector_setup(); /* Setup vector on AP */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 98771e2a78af..754f4153123e 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-lib-$(CONFIG_PERFMON) += carta_random.o
+lib-$(CONFIG_PERFMON_V20) += carta_random.o
AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile
index aad27a718ee0..3323fd5a46e9 100644
--- a/arch/ia64/oprofile/Makefile
+++ b/arch/ia64/oprofile/Makefile
@@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_PERFMON) += perfmon.o
+oprofile-$(CONFIG_PERFMON_V20) += perfmon.o
diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c
index 31b545c35460..9ed2bc152fba 100644
--- a/arch/ia64/oprofile/init.c
+++ b/arch/ia64/oprofile/init.c
@@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
{
int ret = -ENODEV;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* perfmon_init() can fail, but we have no way to report it */
ret = perfmon_init(ops);
#endif
@@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
void oprofile_arch_exit(void)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
perfmon_exit();
#endif
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b5e714373385..cdc53491c033 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
+source "arch/x86/perfmon/Kconfig"
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cf72b569db41..f3af2b0b4f15 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
+# perfmon support
+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
+
# Remaining sub architecture files
core-y += $(mcore-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..891af3e6b3a6 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -826,4 +826,9 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad sys_pfm_create
+ .quad sys_pfm_write
+ .quad sys_pfm_read /* 335 */
+ .quad sys_pfm_attach
+ .quad sys_pfm_set_state
ia32_syscall_end:
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..15d495f73485 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += perfmon.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..0ba6dd3aa24e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
+ * Perfmon PMU interrupt vector
+ */
+#define LOCAL_PERFMON_VECTOR 0xee
+
+/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..e940722dc1f0 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
+#ifdef CONFIG_PERFMON
+BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR)
+#endif
+
#endif
diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h
new file mode 100644
index 000000000000..906f4b24cf0c
--- /dev/null
+++ b/arch/x86/include/asm/perfmon.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This file contains i386/x86_64 specific definitions for the perfmon
+ * interface.
+ *
+ * This file MUST never be included directly. Use linux/perfmon.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON__H_
+#define _ASM_X86_PERFMON__H_
+
+/*
+ * arch-specific user visible interface definitions
+ */
+
+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
+
+#endif /* _ASM_X86_PERFMON__H_ */
diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h
new file mode 100644
index 000000000000..7cadbb894e83
--- /dev/null
+++ b/arch/x86/include/asm/perfmon_kern.h
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This file contains X86 Processor Family specific definitions
+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_KERN_H_
+#define _ASM_X86_PERFMON_KERN_H_
+
+#ifdef CONFIG_PERFMON
+#include <linux/unistd.h>
+#ifdef CONFIG_4KSTACKS
+#define PFM_ARCH_STK_ARG 8
+#else
+#define PFM_ARCH_STK_ARG 16
+#endif
+
+struct pfm_arch_pmu_info {
+ u32 flags; /* PMU feature flags */
+ /*
+ * mandatory model-specific callbacks
+ */
+ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
+ int (*has_ovfls)(struct pfm_context *ctx);
+ void (*quiesce)(void);
+
+ /*
+ * optional model-specific callbacks
+ */
+ void (*acquire_pmu_percpu)(void);
+ void (*release_pmu_percpu)(void);
+ int (*load_context)(struct pfm_context *ctx);
+ void (*unload_context)(struct pfm_context *ctx);
+};
+
+/*
+ * PMU feature flags
+ */
+#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */
+#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */
+
+struct pfm_x86_ctx_flags {
+ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */
+ unsigned int reserved:31; /* for future use */
+};
+
+struct pfm_arch_context {
+ u64 saved_real_iip; /* instr pointer of last NMI intr */
+ struct pfm_x86_ctx_flags flags; /* flags */
+ int saved_started;
+};
+
+/*
+ * functions implemented as inline on x86
+ */
+
+/**
+ * pfm_arch_write_pmc - write a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ * @value: PMC 64-bit value
+ *
+ * in certain situations, ctx may be NULL
+ */
+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * we only write to the actual register when monitoring is
+ * active (pfm_start was issued)
+ */
+ if (ctx && ctx->flags.started == 0)
+ return;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_write_pmd - write a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ * @value: PMD 64-bit value
+ */
+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * to make sure the counter overflows, we set the
+ * upper bits. we also clear any other unimplemented
+ * bits as this may cause crash on some processors.
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
+ value = (value | ~pfm_pmu_conf->ovfl_mask)
+ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_read_pmd - read a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_read_pmc - read a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_is_active - return non-zero if monitoring has been started
+ * @ctx: context to check
+ *
+ * At certain points, perfmon needs to know if monitoring has been
+ * explicitly started.
+ *
+ * On x86, there is no other way but to use pfm_start/pfm_stop
+ * to activate monitoring, thus we can simply check flags.started
+ */
+static inline int pfm_arch_is_active(struct pfm_context *ctx)
+{
+ return ctx->flags.started;
+}
+
+
+/**
+ * pfm_arch_unload_context - detach context from thread or CPU
+ * @ctx: context to detach
+ *
+ * in system-wide ctx->task is NULL, otherwise it points to the
+ * attached thread
+ */
+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ if (ctx_arch->flags.insecure) {
+ PFM_DBG("clear cr4.pce");
+ clear_in_cr4(X86_CR4_PCE);
+ }
+
+ if (pmu_info->unload_context)
+ pmu_info->unload_context(ctx);
+}
+
+/**
+ * pfm_arch_load_context - attach context to thread or CPU
+ * @ctx: context to attach
+ */
+static inline int pfm_arch_load_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+ int ret = 0;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * RDPMC authorized in system-wide and
+ * per-thread self-monitoring.
+ *
+ * RDPMC only gives access to counts.
+ *
+ * The context-switch routine code does not restore
+ * all the PMD registers (optimization), thus there
+ * is a possible leak of counts there in per-thread
+ * mode.
+ */
+ if (ctx->task == current) {
+ PFM_DBG("set cr4.pce");
+ set_in_cr4(X86_CR4_PCE);
+ ctx_arch->flags.insecure = 1;
+ }
+
+ if (pmu_info->load_context)
+ ret = pmu_info->load_context(ctx);
+
+ return ret;
+}
+
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
+
+/**
+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
+ * @ctx: current context
+ * @set: current event set
+ *
+ * called from __pfm_interrupt_handler().
+ * ctx is not NULL. ctx is locked. interrupts are masked
+ *
+ * The following actions must take place:
+ * - stop all monitoring to ensure handler has consistent view.
+ * - collect overflowed PMDs bitmask into povfls_pmds and
+ * npend_ovfls. If no interrupt detected then npend_ovfls
+ * must be set to zero.
+ */
+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_context *ctx_arch;
+ ctx_arch = pfm_ctx_arch(ctx);
+ /*
+ * on X86, freezing is equivalent to stopping
+ */
+ pfm_arch_stop(current, ctx);
+
+ /*
+ * we mark monitoring as stopped to avoid
+ * certain side effects especially in
+ * pfm_arch_restore_pmcs()
+ */
+ ctx_arch->saved_started = ctx->flags.started;
+ ctx->flags.started = 0;
+}
+
+/**
+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
+ * @ctx: current context
+ *
+ * current context may be NULL when dealing with spurious interrupts
+ *
+ * Must re-activate monitoring if context is not MASKED.
+ * interrupts are masked.
+ */
+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ if (ctx == NULL)
+ return;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ PFM_DBG_ovfl("state=%d", ctx->state);
+
+ /*
+ * restore flags.started which is cleared in
+ * pfm_arch_intr_freeze_pmu()
+ */
+ ctx->flags.started = ctx_arch->saved_started;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
+ * @ctx: current context
+ * @cnum: PMD index
+ *
+ * On some CPUs, the upper bits of a counter must be set in order for the
+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
+ * and the upper bits are cleared. This function may be used to set them back.
+ *
+ * For x86, the current version loses whatever is remaining in the counter,
+ * which is usually a small count. In order not to lose this count,
+ * we do a read-modify-write to set the upper bits while preserving the
+ * low-order bits. This is slow but works.
+ */
+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 val;
+ val = pfm_arch_read_pmd(ctx, cnum);
+ pfm_arch_write_pmd(ctx, cnum, val);
+}
+
+/**
+ * pfm_arch_context_create - create context
+ * @ctx: newly created context
+ * @ctx_flags: context flags as passed by user
+ *
+ * called from __pfm_create_context()
+ */
+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
+{
+ return 0;
+}
+
+/**
+ * pfm_arch_context_free - free context
+ * @ctx: context to free
+ */
+static inline void pfm_arch_context_free(struct pfm_context *ctx)
+{}
+
+/*
+ * functions implemented in arch/x86/perfmon/perfmon.c
+ */
+int pfm_arch_init(void);
+void pfm_arch_resend_irq(struct pfm_context *ctx);
+
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
+
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
+void pfm_arch_pmu_config_remove(void);
+char *pfm_arch_get_pmu_module_name(void);
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
+void pfm_arch_pmu_release(void);
+
+static inline void pfm_arch_serialize(void)
+{}
+
+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
+{}
+
+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
+{}
+
+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
+/*
+ * x86 does not need extra alignment requirements for the sampling buffer
+ */
+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
+
+asmlinkage void pmu_interrupt(void);
+
+static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits)
+{
+ bitmap_copy((unsigned long *)a,
+ (unsigned long *)b,
+ nbits);
+}
+
+static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_or((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_and((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+
+static inline void pfm_arch_bv_zero(u64 *a, int nbits)
+{
+ bitmap_zero((unsigned long *)a, nbits);
+}
+
+static inline int pfm_arch_bv_weight(u64 *a, int nbits)
+{
+ return bitmap_weight((unsigned long *)a, nbits);
+}
+
+static inline void pfm_arch_bv_set_bit(int b, u64 *a)
+{
+ __set_bit(b, (unsigned long *)a);
+}
+
+static inline void pfm_arch_bv_clear_bit(int b, u64 *a)
+{
+ __clear_bit(b, (unsigned long *)a);
+}
+
+static inline int pfm_arch_bv_test_bit(int b, u64 *a)
+{
+ return test_bit(b, (unsigned long *)a);
+}
+
+static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr,
+ unsigned long size,
+ unsigned long offset)
+{
+ return find_next_bit((unsigned long *)addr,
+ size,
+ offset);
+}
+#endif /* CONFIG_PERFMON */
+
+#endif /* _ASM_X86_PERFMON_KERN_H_ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0ddd534bef44 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
@@ -92,6 +93,7 @@ struct thread_info {
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
+#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -114,6 +116,8 @@ struct thread_info {
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -135,12 +139,12 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME|_TIF_PERFMON_WORK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
- _TIF_NOTSC)
+ _TIF_NOTSC|_TIF_PERFMON_CTXSW)
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..06908451002f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,11 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_pfm_create 333
+#define __NR_pfm_write (__NR_pfm_create+1)
+#define __NR_pfm_read (__NR_pfm_create+2)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89fb..a42bb5eb9edb 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_pfm_create 295
+__SYSCALL(__NR_pfm_create, sys_pfm_create)
+#define __NR_pfm_write (__NR_pfm_create+1)
+__SYSCALL(__NR_pfm_write, sys_pfm_write)
+#define __NR_pfm_read (__NR_pfm_create+2)
+ __SYSCALL(__NR_pfm_read, sys_pfm_read)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+__SYSCALL(__NR_pfm_attach, sys_pfm_attach)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
+__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9134de814c97..9f8826f33032 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -513,7 +513,7 @@ ENDPROC(system_call)
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
- testb $_TIF_NEED_RESCHED, %cl
+ testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx
jz work_notifysig
work_resched:
call schedule
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 983d85aeccce..1d9bef0797d9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -876,7 +876,13 @@ END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
-
+
+#ifdef CONFIG_PERFMON
+ENTRY(pmu_interrupt)
+ apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt
+END(pmu_interrupt)
+#endif
+
/*
* Exception entry points.
*/
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..24a0140e6c36 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/perfmon_kern.h>
#include <asm/acpi.h>
#include <asm/atomic.h>
@@ -224,6 +225,10 @@ void __init native_init_IRQ(void)
apic_intr_init();
+#ifdef CONFIG_PERFMON
+ alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
+
if (!acpi_ioapic)
setup_irq(2, &irq2);
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..7ff71d4d6d9b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,6 +36,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
@@ -258,6 +259,7 @@ void exit_thread(void)
ds_free(current->thread.ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(gs, p->thread.gs);
+ pfm_copy_thread(p);
+
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
set_debugreg(next->debugreg1, 1);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3180e79c3697..86099f98104a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
@@ -255,6 +256,7 @@ void exit_thread(void)
ds_free(t->ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ pfm_copy_thread(p);
+
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
@@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
prev = &prev_p->thread,
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = prev->debugctlmsr;
#ifdef CONFIG_X86_DS
@@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
loaddebug(next, 1);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 27a5c8174322..7d6fc603dea7 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -19,6 +19,7 @@
#include <linux/wait.h>
#include <linux/tracehook.h>
#include <linux/elf.h>
+#include <linux/perfmon_kern.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index d2307e41fbdb..24e389836fc0 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -21,6 +21,7 @@
#include <linux/personality.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
+#include <linux/perfmon_kern.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..81c22739f70b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,8 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_pfm_create
+ .long sys_pfm_write
+ .long sys_pfm_read /* 335 */
+ .long sys_pfm_attach
+ .long sys_pfm_set_state
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 022cd41ea9b4..584a9ef4e44c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -17,6 +17,7 @@
#include <linux/moduleparam.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
+#include <linux/perfmon_kern.h>
#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -142,12 +143,18 @@ static int nmi_setup(void)
int err = 0;
int cpu;
- if (!allocate_msrs())
+ if (pfm_session_allcpus_acquire())
+ return -EBUSY;
+
+ if (!allocate_msrs()) {
+ pfm_session_allcpus_release();
return -ENOMEM;
+ }
err = register_die_notifier(&profile_exceptions_nb);
if (err) {
free_msrs();
+ pfm_session_allcpus_release();
return err;
}
@@ -228,6 +235,7 @@ static void nmi_shutdown(void)
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
+ pfm_session_allcpus_release();
put_cpu_var(cpu_msrs);
}
diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig
new file mode 100644
index 000000000000..8144d1d0d600
--- /dev/null
+++ b/arch/x86/perfmon/Kconfig
@@ -0,0 +1,33 @@
+menu "Hardware Performance Monitoring support"
+config PERFMON
+ bool "Perfmon2 performance monitoring interface"
+ select X86_LOCAL_APIC
+ default n
+ help
+ Enables the perfmon2 interface to access the hardware
+ performance counters. See <http://perfmon2.sf.net/> for
+ more details.
+
+config PERFMON_DEBUG
+ bool "Perfmon debugging"
+ default n
+ depends on PERFMON
+ help
+ Enables perfmon debugging support
+
+config X86_PERFMON_INTEL_ARCH
+ bool "Support for Intel architectural perfmon v1/v2/v3"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Intel architectural performance counters.
+ This feature was introduced with Intel Core Solo/Core Duo processors.
+
+config X86_PERFMON_AMD64
+ bool "Support AMD Athlon/Opteron hardware performance counters"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Athlon/Opteron hardware performance counters.
+ Support for family 6, 15 and 16 processors.
+ endmenu
diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile
new file mode 100644
index 000000000000..c0a4ca0da329
--- /dev/null
+++ b/arch/x86/perfmon/Makefile
@@ -0,0 +1,7 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
+obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o
diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c
new file mode 100644
index 000000000000..844f19dc6cb0
--- /dev/null
+++ b/arch/x86/perfmon/perfmon.c
@@ -0,0 +1,619 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/interrupt.h>
+#include <linux/perfmon_kern.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU(unsigned long, real_iip);
+DEFINE_PER_CPU(int, pfm_using_nmi);
+
+/**
+ * pfm_arch_ctxswin_thread - thread context switch in
+ * @task: task switched in
+ * @ctx: context for the task
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * ctx->active_set cannot be NULL. Context is locked. Interrupts are masked.
+ *
+ * Caller has already restored all PMD and PMC registers, if
+ * necessary (i.e., lazy restore scheme).
+ *
+ * On x86, the common code restores the saved real_iip when the active
+ * set has pending overflows, and re-enables RDPMC (CR4.PCE) for contexts
+ * flagged as insecure.
+ *
+ * Model-specific features, e.g., PEBS, IBS, are taken care of in the
+ * corresponding PMU description module
+ */
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ /*
+ * restore saved real iip
+ */
+ if (ctx->active_set->npend_ovfls)
+ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
+
+ /*
+ * enable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ set_in_cr4(X86_CR4_PCE);
+}
+
+/**
+ * pfm_arch_ctxswout_thread - context switch out thread
+ * @task: task switched out
+ * @ctx: context switched out
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked. Monitoring may be active.
+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
+ *
+ * Flags the active set so that PMCs are not lazily restored, saves the
+ * per-CPU real_iip if overflows are pending, disables RDPMC for insecure
+ * contexts and finally calls the model-specific stop_save() callback.
+ *
+ * Return (value of the model-specific stop_save() callback):
+ * non-zero : did not save PMDs (as part of stopping the PMU)
+ * 0 : saved PMDs (no need to save them in caller)
+ */
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * disable lazy restore of PMCS on ctxswin because
+ * we modify some of them.
+ */
+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+
+ if (ctx->active_set->npend_ovfls)
+ ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
+
+ /*
+ * disable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ clear_in_cr4(X86_CR4_PCE);
+
+ return pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_stop - deactivate monitoring
+ * @task: task to stop
+ * @ctx: context to stop
+ *
+ * Called from pfm_stop()
+ * Interrupts are masked. Context is locked. Set is the active set.
+ *
+ * For per-thread:
+ * task is not necessarily current. If not current task, then
+ * task is guaranteed stopped and off any cpu. Access to PMU
+ * is not guaranteed.
+ *
+ * must disable active monitoring. ctx cannot be NULL
+ *
+ * Nothing is done when the context is not started or when task is not
+ * current. Otherwise the model-specific stop_save() callback is invoked
+ * on the active set; its return value is ignored here.
+ */
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * no need to go through stop_save()
+ * if we are already stopped
+ */
+ if (!ctx->flags.started)
+ return;
+
+ if (task != current)
+ return;
+
+ pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+
+/**
+ * pfm_arch_start - activate monitoring
+ * @task: task to start
+ * @ctx: context to start
+ *
+ * Interrupts are masked. Context is locked.
+ *
+ * For per-thread:
+ * Task is not necessarily current. If not current task, then task
+ * is guaranteed stopped and off any cpu. No access to PMU if task
+ * is not current.
+ */
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
+{
+ /*
+ * cannot restore PMC if no access to PMU. Will be done
+ * when the thread is switched back in
+ */
+ if (task != current)
+ return;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_restore_pmds - reload PMD registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw()
+ *
+ * Context is locked. Interrupts are masked. Set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ *
+ * Writes set->pmds[i] back for every register marked in set->used_pmds.
+ */
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ num = set->nused_pmds;
+
+ /*
+ * we can restore only the PMDs we use because:
+ *
+ * - can only read with pfm_read_pmds() the registers
+ * declared used via pfm_write_pmds()
+ *
+ * - if cr4.pce=1, only counters are exposed to user. RDPMC
+ * does not work with other types of PMU registers. Thus, no
+ * address is ever exposed by counters
+ *
+ * - there is never a dependency between one pmd register and
+ * another
+ */
+ for (i = 0; num; i++) {
+ if (likely(pfm_arch_bv_test_bit(i, set->used_pmds))) {
+ pfm_write_pmd(ctx, i, set->pmds[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * pfm_arch_restore_pmcs - reload PMC registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked. set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ *
+ * function must restore all PMC registers from set. Nothing is written
+ * when the context is not started.
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ /*
+ * we need to restore PMCs only when:
+ * - context is not masked
+ * - monitoring activated
+ *
+ * Masking monitoring after an overflow does not change the
+ * value of flags.started
+ */
+ if (!ctx->flags.started)
+ return;
+
+ /*
+ * restore all pmcs
+ *
+ * It is not possible to restore only the pmcs we used because
+ * certain PMU models (e.g. Pentium 4) have dependencies. Thus
+ * we do not want one application using stale PMCs coming from
+ * another one.
+ *
+ * On PMU models where there are no dependencies between PMCs,
+ * it is possible to optimize by only restoring the registers that
+ * are used, but this has to be done by model-specific code.
+ */
+ num = ctx->regs.num_pmcs;
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, ctx->regs.pmcs)) {
+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible race with other, higher priority interrupts, such as timer
+ * or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+ unsigned long iip;
+ int using_nmi;
+
+ using_nmi = __get_cpu_var(pfm_using_nmi);
+
+ ack_APIC_irq();
+
+ irq_enter();
+
+ /*
+ * when using NMI, pfm_handle_nmi() gets called
+ * first. It stops monitoring and records the
+ * iip into real_iip, then it reposts the interrupt
+ * using the lower priority vector LOCAL_PERFMON_VECTOR
+ *
+ * On some processors, e.g., P4, it may be that some
+ * state is already recorded from pfm_handle_nmi()
+ * and it only needs to be copied back into the normal
+ * fields so it can be used transparently by higher level
+ * code.
+ */
+ if (using_nmi)
+ iip = __get_cpu_var(real_iip);
+ else
+ iip = instruction_pointer(regs);
+
+ pfm_interrupt_handler(iip, regs);
+
+ /*
+ * On Intel processors:
+ * - it is necessary to clear the MASK field for the LVTPC
+ * vector. Otherwise interrupts remain masked. See
+ * section 8.5.1
+ * AMD X86-64:
+ * - the documentation does not stipulate the behavior but
+ * it seems to work without the write, so we skip it
+ */
+ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+ irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * called from notify_die() notifier from a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, including the perfmon context lock
+ *
+ * Must detect if NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
+ *
+ * Return:
+ * NOTIFY_DONE: not a DIE_NMI_IPI event, perfmon not using NMI on this
+ * CPU, no context, or no overflow detected
+ * NOTIFY_STOP: the NMI was caused by the PMU and has been reposted
+ */
+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
+ unsigned long val,
+ void *data)
+{
+ struct die_args *args = data;
+ struct pfm_context *ctx;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ /*
+ * only NMI related calls
+ */
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_DONE;
+
+ /*
+ * perfmon not using NMI
+ */
+ if (!__get_cpu_var(pfm_using_nmi))
+ return NOTIFY_DONE;
+
+ /*
+ * No context
+ */
+ ctx = __get_cpu_var(pmu_ctx);
+ if (!ctx) {
+ PFM_DBG_ovfl("no ctx");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * Detect if we have overflows, i.e., NMI interrupt
+ * caused by PMU
+ */
+ pmu_info = pfm_pmu_info();
+ if (!pmu_info->has_ovfls(ctx)) {
+ PFM_DBG_ovfl("no ovfl");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * we stop the PMU to avoid further overflow before this
+ * one is treated by lower priority interrupt handler
+ */
+ pmu_info->quiesce();
+
+ /*
+ * record actual instruction pointer
+ */
+ __get_cpu_var(real_iip) = instruction_pointer(args->regs);
+
+ /*
+ * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
+ */
+ pfm_arch_resend_irq(ctx);
+
+ /*
+ * we need to rewrite the APIC vector on Intel
+ */
+ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * the notification was for us
+ */
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block pfm_nmi_nb = {
+ .notifier_call = pfm_handle_nmi
+};
+
+/**
+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
+ * @ctx: context (not used by this implementation)
+ *
+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
+ */
+void pfm_arch_resend_irq(struct pfm_context *ctx)
+{
+ unsigned long val, dest;
+ /*
+ * we cannot use hw_resend_irq() because it goes to
+ * the I/O APIC. We need to go to the Local APIC.
+ *
+ * The "int vec" is not the right solution either
+ * because it triggers a software intr. We need
+ * to regenerate the interrupt and have it pended
+ * until we unmask interrupts.
+ *
+ * Instead we send ourselves an IPI on the perfmon
+ * vector.
+ */
+ val = APIC_DEST_SELF|APIC_INT_ASSERT|
+ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
+
+ dest = apic_read(APIC_ID);
+ apic_write(APIC_ICR2, dest);
+ apic_write(APIC_ICR, val);
+}
+
+/**
+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
+ * @data: PMU flags (pmu_info->flags) passed as a pointer-sized integer
+ *
+ * Run on each CPU through on_each_cpu() by pfm_arch_pmu_acquire().
+ * Records in the per-CPU pfm_using_nmi whether LVTPC reads back as
+ * APIC_DM_NMI, then calls the optional model-specific acquire hook.
+ */
+static void pfm_arch_pmu_acquire_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ unsigned int tmp, vec;
+ unsigned long flags = (unsigned long)data;
+ unsigned long lvtpc;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ /*
+ * we only reprogram the LVTPC vector if we have detected
+ * no sharing, otherwise it means the APIC is already programmed
+ * and we use whatever vector (likely NMI) is there
+ */
+ if (!(flags & PFM_X86_FL_SHARING)) {
+ vec = LOCAL_PERFMON_VECTOR;
+
+ tmp = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, vec);
+ apic_write(APIC_LVTERR, tmp);
+ }
+ lvtpc = (unsigned long)apic_read(APIC_LVTPC);
+
+ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
+
+ PFM_DBG("LTVPC=0x%lx using_nmi=%d",
+ lvtpc, __get_cpu_var(pfm_using_nmi));
+ /*
+ * invoke model specific acquire routine.
+ */
+ if (pmu_info->acquire_pmu_percpu)
+ pmu_info->acquire_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_acquire - acquire PMU resource from system
+ * @unavail_pmcs : bitmask to use to set unavailable pmcs
+ * @unavail_pmds : bitmask to use to set unavailable pmds
+ *
+ * interrupts are not masked
+ *
+ * Grab PMU registers from lower level MSR allocator
+ *
+ * Program the APIC according to the possible interrupt vector
+ * either LOCAL_PERFMON_VECTOR or NMI
+ *
+ * Return:
+ * 0 : registers reserved (possibly shared), APIC programmed
+ * -EBUSY: some PMC could not be reserved and the PMU model does not
+ * support sharing; PMC reservations made here are undone
+ */
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_regmap_desc *d;
+ u16 i, nlost;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ pmu_info->flags &= ~PFM_X86_FL_SHARING;
+
+ nlost = 0;
+
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ /*
+ * reserve register with lower-level allocator
+ */
+ if (!reserve_evntsel_nmi(d->hw_addr)) {
+ PFM_DBG("pmc%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmcs);
+ nlost++;
+ continue;
+ }
+ }
+ PFM_DBG("nlost=%d info_flags=0x%x", nlost, pmu_info->flags);
+ /*
+ * some PMU models (e.g., P6) do not support sharing
+ * so check if we found less than the expected number of PMC registers
+ */
+ if (nlost) {
+ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
+ PFM_INFO("PMU already used by another subsystem, "
+ "PMU does not support sharing, "
+ "try disabling Oprofile or "
+ "reboot with nmi_watchdog=0");
+ goto undo;
+ }
+ pmu_info->flags |= PFM_X86_FL_SHARING;
+ }
+
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (!reserve_perfctr_nmi(d->hw_addr)) {
+ PFM_DBG("pmd%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmds);
+ }
+ }
+ /*
+ * program APIC on each CPU
+ */
+ on_each_cpu(pfm_arch_pmu_acquire_percpu,
+ (void *)(unsigned long)pmu_info->flags , 1);
+
+ return 0;
+undo:
+ /*
+ * must undo reservation of pmcs in case of error
+ */
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+ if (!pfm_arch_bv_test_bit(i, unavail_pmcs))
+ release_evntsel_nmi(d->hw_addr);
+ }
+ return -EBUSY;
+}
+
+/**
+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
+ * @data: unused
+ *
+ * Resets the per-CPU pfm_using_nmi flag and calls the optional
+ * model-specific release hook.
+ */
+static void pfm_arch_pmu_release_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+
+ __get_cpu_var(pfm_using_nmi) = 0;
+ /*
+ * invoke model specific release routine.
+ */
+ if (pmu_info->release_pmu_percpu)
+ pmu_info->release_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_release - release PMU resource to system
+ *
+ * called from pfm_pmu_release()
+ * interrupts are not masked
+ *
+ * On x86, we return the PMU registers to the MSR allocator
+ *
+ * NOTE(review): the per-CPU release routine is only run when the calling
+ * CPU has pfm_using_nmi set, whereas the per-CPU acquire routine runs
+ * unconditionally; confirm that skipping the model-specific
+ * release_pmu_percpu() hook in the non-NMI case is intended.
+ */
+void pfm_arch_pmu_release(void)
+{
+ struct pfm_regmap_desc *d;
+ u16 i, n;
+
+ d = pfm_pmu_conf->pmc_desc;
+ n = pfm_pmu_conf->regs_all.num_pmcs;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs))
+ continue;
+ release_evntsel_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmc%u released", i);
+ }
+ d = pfm_pmu_conf->pmd_desc;
+ n = pfm_pmu_conf->regs_all.num_pmds;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds))
+ continue;
+ release_perfctr_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmd%u released", i);
+ }
+
+ /* clear NMI variable if used */
+ if (__get_cpu_var(pfm_using_nmi))
+ on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1);
+}
+
+/**
+ * pfm_arch_init - one time global arch-specific initialization
+ *
+ * called from pfm_init()
+ *
+ * Return: 0 on success, otherwise the error reported by
+ * register_die_notifier()
+ */
+int __init pfm_arch_init(void)
+{
+ /*
+ * we need to register our NMI handler when the kernel boots
+ * to avoid a deadlock condition with the NMI watchdog or Oprofile
+ * if we were to try and register/unregister on-demand.
+ *
+ * propagate a registration failure to the caller instead of
+ * unconditionally reporting success.
+ */
+ return register_die_notifier(&pfm_nmi_nb);
+}
diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c
new file mode 100644
index 000000000000..f078fe28137d
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_amd64.c
@@ -0,0 +1,483 @@
+/*
+ * This file contains the PMU description for the Athlon64 and Opteron64
+ * processors. It supports 32 and 64-bit modes.
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
+#include <linux/topology.h>
+#include <linux/pci.h>
+#include <linux/perfmon_kern.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+
+static void __kprobes pfm_amd64_quiesce(void);
+static int pfm_amd64_has_ovfls(struct pfm_context *ctx);
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+
+static struct pfm_arch_pmu_info pfm_amd64_pmu_info = {
+ .stop_save = pfm_amd64_stop_save,
+ .has_ovfls = pfm_amd64_has_ovfls,
+ .quiesce = pfm_amd64_quiesce,
+};
+
+/*
+ * force Local APIC interrupt on overflow
+ */
+#define PFM_K8_VAL (1ULL<<20)
+#define PFM_K8_NO64 (1ULL<<20)
+
+/*
+ * reserved bits must be 1
+ *
+ * for family 15:
+ * - upper 32 bits are reserved
+ * - bit 20, bit 21
+ *
+ * for family 16:
+ * - bits 36-39 are reserved
+ * - bits 42-63 are reserved
+ * - bit 20, bit 21
+ *
+ */
+#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
+#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21))
+
+static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = {
+/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0),
+/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1),
+/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2),
+/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3),
+};
+#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc)
+
+/*
+ * AMD64 counters are 48 bits, upper bits are reserved
+ */
+#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1))
+
+#define PFM_AMD_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PERFCTR"#n, \
+ .hw_addr = MSR_K7_PERFCTR0+n, \
+ .rsvd_msk = PFM_AMD64_CTR_RSVD, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = {
+/* pmd0 */ PFM_AMD_D(0),
+/* pmd1 */ PFM_AMD_D(1),
+/* pmd2 */ PFM_AMD_D(2),
+/* pmd3 */ PFM_AMD_D(3)
+};
+#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc)
+
+static struct pfm_context *pfm_nb_task_owner;
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf;
+
+/**
+ * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events
+ * @ctx: context to use
+ *
+ * There can only be one user per socket for the Northbridge (NB) events,
+ * so we enforce mutual exclusion as follows:
+ * - per-thread : only one context machine-wide can use NB events
+ *
+ * Ownership is recorded in pfm_nb_task_owner with cmpxchg(); a context
+ * that already owns NB access may call this again and still succeeds.
+ *
+ * Exclusion is enforced at:
+ * - pfm_load_context()
+ * - pfm_write_pmcs() for attached contexts
+ *
+ * Exclusion is released at:
+ * - pfm_unload_context() or any call that implicitly uses it
+ *
+ * return:
+ * 0 : successfully acquire NB access
+ * < 0: errno, failed to acquire NB access
+ */
+static int pfm_amd64_acquire_nb(struct pfm_context *ctx)
+{
+ struct pfm_context *old;
+
+ /*
+ * the owner is tracked globally, so the socket id of the
+ * current CPU is not needed here
+ */
+ old = cmpxchg(&pfm_nb_task_owner, NULL, ctx);
+ if (!old) {
+ PFM_DBG("acquired Northbridge event access globally");
+ } else if (old != ctx) {
+ PFM_DBG("global NorthBridge event conflict");
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * pfm_amd64_pmc_write_check -- check validity of pmc writes
+ * @ctx: context to use
+ * @set: event set to use
+ * @req: user request to modify the pmc
+ *
+ * invoked from pfm_write_pmcs() once pfm_amd64_setup_nb_event_ctrl()
+ * has installed it as the pmc_write_check callback.
+ *
+ * context is locked, interrupts are masked
+ *
+ * return:
+ * 0 : context is unloaded, or the event select (low 8 bits of
+ * req->reg_value) is below 0xee
+ * otherwise the result of pfm_amd64_acquire_nb()
+ */
+static int pfm_amd64_pmc_write_check(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req)
+{
+ unsigned int event;
+
+ /*
+ * delay checking NB event until we load the context
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return 0;
+
+ /*
+ * check event is NB event
+ */
+ event = (unsigned int)(req->reg_value & 0xff);
+ if (event < 0xee)
+ return 0;
+
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_load_context - amd64 model-specific load callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_load_context().
+ * context is locked, interrupts are masked
+ *
+ * Scans the used PMCs of the active set for an event select (low 8 bits)
+ * of 0xee or above, i.e., what the write checker treats as an NB event.
+ *
+ * return:
+ * 0 : no such event is programmed
+ * otherwise the result of pfm_amd64_acquire_nb()
+ */
+static int pfm_amd64_load_context(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ unsigned int i, n;
+
+ set = ctx->active_set;
+ n = set->nused_pmcs;
+ for (i = 0; n; i++) {
+ if (!pfm_arch_bv_test_bit(i, set->used_pmcs))
+ continue;
+
+ if ((set->pmcs[i] & 0xff) >= 0xee)
+ goto found;
+ n--;
+ }
+ return 0;
+found:
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_unload_context -- amd64 model-specific unload callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_unload_context()
+ *
+ * Drops the global Northbridge ownership if, and only if, it is
+ * currently held by @ctx.
+ */
+static void pfm_amd64_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_context *old;
+
+ /*
+ * the owner is tracked globally, so the socket id of the
+ * current CPU is not needed here
+ */
+ old = cmpxchg(&pfm_nb_task_owner, ctx, NULL);
+ if (old == ctx)
+ PFM_DBG("released NorthBridge events globally");
+}
+
+/**
+ * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls
+ *
+ * detect if we need to activate NorthBridge event access control
+ *
+ * Access control is only activated when the highest physical package
+ * id found among the possible CPUs is at least 1.
+ *
+ * return:
+ * 0 : success (access control activated or not needed)
+ * -ENOMEM: highest socket id is above 255
+ */
+static int pfm_amd64_setup_nb_event_ctrl(void)
+{
+ unsigned int c, n = 0;
+ unsigned int max_phys = 0;
+
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(c) {
+ if (cpu_data(c).phys_proc_id > max_phys)
+ max_phys = cpu_data(c).phys_proc_id;
+ }
+#else
+ max_phys = 0;
+#endif
+ if (max_phys > 255) {
+ /* max_phys is unsigned, print it as such */
+ PFM_INFO("socket id %u is too big to handle", max_phys);
+ return -ENOMEM;
+ }
+
+ n = max_phys + 1;
+ if (n < 2)
+ return 0;
+
+ pfm_nb_task_owner = NULL;
+
+ /*
+ * activate write-checker for PMC registers
+ */
+ for (c = 0; c < PFM_AMD_NUM_PMCS; c++)
+ pfm_amd64_pmc_desc[c].type |= PFM_REG_WC;
+
+ pfm_amd64_pmu_info.load_context = pfm_amd64_load_context;
+ pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context;
+
+ pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check;
+
+ PFM_INFO("NorthBridge event access control enabled");
+
+ return 0;
+}
+
+/**
+ * pfm_amd64_setup_registers -- initialize register table
+ *
+ * Records PMC0-PMC3 in enable_mask (max_enable = 4) and, on family 16
+ * hosts, replaces the K8 reserved-bit mask of each PMC with the
+ * family 16 one.
+ */
+static void pfm_amd64_setup_registers(void)
+{
+	u16 r;
+
+	for (r = 0; r < 4; r++)
+		pfm_arch_bv_set_bit(r, enable_mask);
+	max_enable = 4;
+
+	if (current_cpu_data.x86 != 16)
+		return;
+
+	/*
+	 * adjust reserved bit fields for family 16
+	 */
+	for (r = 0; r < PFM_AMD_NUM_PMCS; r++) {
+		if (pfm_amd64_pmc_desc[r].rsvd_msk == PFM_K8_RSVD)
+			pfm_amd64_pmc_desc[r].rsvd_msk = PFM_16_RSVD;
+	}
+}
+
+/**
+ * pfm_amd64_probe_pmu -- detect host PMU
+ *
+ * Returns 0 when the host is an AMD family 6, 15 or 16 processor with
+ * a local APIC and setup succeeded, -1 otherwise.
+ */
+static int pfm_amd64_probe_pmu(void)
+{
+	int family;
+
+	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return -1;
+
+	family = current_cpu_data.x86;
+	if (family != 6 && family != 15 && family != 16) {
+		PFM_INFO("unsupported family=%d", family);
+		return -1;
+	}
+	PFM_INFO("found family=%d", family);
+
+	/* the local APIC is mandatory */
+	if (!cpu_has_apic) {
+		PFM_INFO("no local APIC, unsupported");
+		return -1;
+	}
+
+	/* NorthBridge access control is only considered on multi-core parts */
+	if (current_cpu_data.x86_max_cores > 1) {
+		if (pfm_amd64_setup_nb_event_ctrl())
+			return -1;
+	}
+
+	pfm_amd64_setup_registers();
+
+	return 0;
+}
+
+/**
+ * pfm_amd64_has_ovfls -- detect if pending overflows
+ * @ctx: context to use
+ *
+ * Reads each counter PMD of the context and reports an overflow as
+ * soon as one has bit counter_width clear.
+ * return:
+ * 	0 : no overflow
+ * 	1 : at least one overflow
+ */
+static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx)
+{
+	struct pfm_regmap_desc *desc = pfm_amd64_pmd_desc;
+	u64 *counters = ctx->regs.cnt_pmds;
+	u64 top_bit = 1ULL << pfm_pmu_conf->counter_width;
+	u64 raw;
+	u16 left = ctx->regs.num_counters;
+	u16 idx;
+
+	for (idx = 0; left; idx++) {
+		if (!pfm_arch_bv_test_bit(idx, counters))
+			continue;
+
+		rdmsrl(desc[idx].hw_addr, raw);
+		if (!(raw & top_bit))
+			return 1;
+		left--;
+	}
+	return 0;
+}
+
+/**
+ * pfm_amd64_stop_save - stop monitoring, collect pending overflows
+ * @ctx: context to use
+ * @set: event set to stop
+ *
+ * interrupts are masked, PMU access guaranteed
+ *
+ * First writes 0 to every used PMC that is also in enable_mask, then,
+ * unless the set already has pending overflows, reads back all used
+ * PMDs into set->pmds[] and records newly detected overflows in
+ * set->povfl_pmds / set->npend_ovfls.
+ *
+ * return:
+ * 	1 : set already had pending overflows, PMDs were not saved here
+ * 	0 : PMDs saved, no need to save them at upper level
+ */
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ u64 used_mask[PFM_PMC_BV];
+ u64 *cnt_pmds;
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ /* NOTE(review): pmu_info is assigned but not read again in this function */
+ pmu_info = pfm_pmu_info();
+
+ /* a counter whose bit counter_width is clear is treated as overflowed */
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ /* only PMCs that are both in use and in enable_mask get stopped */
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already having a pending overflow condition, we simply
+ * return to take care of this first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_pmds = ctx->regs.cnt_pmds;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) {
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i,set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ /* keep the saved bits outside ovfl_mask, take the rest from HW */
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ }
+ /* non-counter PMDs are stored exactly as read */
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_amd64_quiesce -- stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_amd64_quiesce(void)
+{
+	unsigned int r;
+
+	/*
+	 * quiesce PMU by clearing available registers that have
+	 * the start/stop capability: the four event select MSRs
+	 * starting at MSR_K7_EVNTSEL0
+	 */
+	for (r = 0; r < 4; r++) {
+		if (pfm_arch_bv_test_bit(r, pfm_pmu_conf->regs_all.pmcs))
+			wrmsrl(MSR_K7_EVNTSEL0+r, 0);
+	}
+}
+
+/*
+ * PMU description handed to pfm_pmu_register() by
+ * pfm_amd64_pmu_init_module().
+ */
+static struct pfm_pmu_config pfm_amd64_pmu_conf = {
+ .pmu_name = "AMD64",
+ /* bit index tested by the overflow checks (1ULL << counter_width) */
+ .counter_width = 47,
+ .pmd_desc = pfm_amd64_pmd_desc,
+ .pmc_desc = pfm_amd64_pmc_desc,
+ .num_pmc_entries = PFM_AMD_NUM_PMCS,
+ .num_pmd_entries = PFM_AMD_NUM_PMDS,
+ .version = "1.2",
+ /* model-specific callbacks */
+ .pmu_info = &pfm_amd64_pmu_info
+};
+
+/*
+ * Probe the host; register the AMD64 PMU description only when the
+ * probe succeeds, otherwise report -ENOSYS.
+ */
+static int __init pfm_amd64_pmu_init_module(void)
+{
+	int probe_failed = pfm_amd64_probe_pmu();
+
+	return probe_failed ? -ENOSYS : pfm_pmu_register(&pfm_amd64_pmu_conf);
+}
+
+device_initcall(pfm_amd64_pmu_init_module);
diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c
new file mode 100644
index 000000000000..ce4293dcfcda
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_intel_arch.c
@@ -0,0 +1,628 @@
+/*
+ * This file contains the Intel architectural perfmon v1, v2, v3
+ * description tables.
+ *
+ * Architectural perfmon was introduced with Intel Core Solo/Duo
+ * processors.
+ *
+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/perfmon_kern.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+/* bitmask of PMCs with start/stop capability, filled by set_enable_mask() */
+static u64 enable_mask[PFM_MAX_PMCS];
+/* highest PMC index recorded in enable_mask, plus one */
+static u16 max_enable;
+/* architectural perfmon version, recorded by pfm_intel_arch_probe_pmu() */
+static int pfm_intel_arch_version;
+
+/* per-CPU copy of MSR_CORE_PERF_GLOBAL_CTRL taken when the PMU is acquired */
+DEFINE_PER_CPU(u64, saved_global_ctrl);
+
+/*
+ * - upper 32 bits are reserved
+ * - INT: APIC enable bit is reserved (forced to 1)
+ * - bit 21 is reserved
+ *
+ * RSVD: reserved bits are 1
+ */
+#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \
+ | (1ULL<<20) \
+ | (1ULL<<21))
+
+/*
+ * force Local APIC interrupt on overflow
+ * disable with NO_EMUL64
+ */
+#define PFM_IA_PMC_VAL (1ULL<<20)
+#define PFM_IA_NO64 (1ULL<<20)
+
+/*
+ * the architecture specifies that:
+ * IA32_PMCx MSR : starts at 0x0c1 & occupy a contiguous block of MSR
+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR
+ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR
+ */
+#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0
+#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0
+#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0
+
+/*
+ * layout of EAX for CPUID.0xa leaf function
+ *
+ * overlaid on the raw register value through the union declared in
+ * pfm_intel_arch_probe_pmu()
+ */
+struct pmu_eax {
+ unsigned int version:8; /* architectural perfmon version */
+ unsigned int num_cnt:8; /* number of generic counters */
+ unsigned int cnt_width:8; /* width of generic counters */
+ unsigned int ebx_length:8; /* number of architected events */
+};
+
+/*
+ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
+ *
+ * the probe code zeroes the whole register for versions below 2
+ */
+struct pmu_edx {
+ unsigned int num_cnt:5; /* number of fixed counters */
+ unsigned int cnt_width:8; /* width of fixed counters */
+ unsigned int reserved:19;
+};
+
+static void pfm_intel_arch_acquire_pmu_percpu(void);
+static void pfm_intel_arch_release_pmu_percpu(void);
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
+static void __kprobes pfm_intel_arch_quiesce(void);
+
+/*
+ * model-specific callbacks for the Intel architectural PMU; attached
+ * to pfm_intel_arch_pmu_conf through its pmu_info field. The flags
+ * field is adjusted at probe time by pfm_intel_arch_check_errata().
+ */
+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
+ .stop_save = pfm_intel_arch_stop_save,
+ .has_ovfls = pfm_intel_arch_has_ovfls,
+ .quiesce = pfm_intel_arch_quiesce,
+ .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
+ .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
+};
+
+/*
+ * table entry builders; n is the register number within its class.
+ * Every use of n in an expression is parenthesized so that the macros
+ * stay correct for non-literal arguments, and all MSR addresses are
+ * derived from the MSR_GEN_* base aliases defined above.
+ */
+
+/* generic event select register n (PERFEVTSELn) */
+#define PFM_IA_C(n) {					\
+	.type = PFM_REG_I64,				\
+	.desc = "PERFEVTSEL"#n,				\
+	.dfl_val = PFM_IA_PMC_VAL,			\
+	.rsvd_msk = PFM_IA_PMC_RSVD,			\
+	.no_emul64_msk = PFM_IA_NO64,			\
+	.hw_addr = MSR_GEN_SEL_BASE+(n)			\
+	}
+
+/* generic counter n, depends on PMC n */
+#define PFM_IA_D(n)					\
+	{ .type = PFM_REG_C,				\
+	  .desc = "PMC"#n,				\
+	  .hw_addr = MSR_GEN_PMC_BASE+(n),		\
+	  .dep_pmcs[0] = 1ULL << (n)			\
+	}
+
+/* fixed counter n, depends on PMC16 (FIXED_CTRL) */
+#define PFM_IA_FD(n)					\
+	{ .type = PFM_REG_C,				\
+	  .desc = "FIXED_CTR"#n,			\
+	  .hw_addr = MSR_GEN_FIXED_PMC_BASE+(n),	\
+	  .dep_pmcs[0] = 1ULL << 16			\
+	}
+
+
+
+/*
+ * PMC mapping table: pmc0-pmc15 are the generic event select
+ * registers, pmc16 is the fixed counter control register.
+ * Entries the host does not implement are switched to PFM_REG_NA by
+ * pfm_intel_arch_setup_generic() / pfm_intel_arch_setup_fixed().
+ */
+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
+/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3),
+/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7),
+/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11),
+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
+
+/* pmc16 */ { .type = PFM_REG_I,
+ .desc = "FIXED_CTRL",
+ .dfl_val = 0x8888888888888888ULL, /* force PMI */
+ .rsvd_msk = 0, /* set dynamically */
+ .no_emul64_msk = 0,
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
+ },
+};
+/* number of entries in the PMC table */
+#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc)
+
+/*
+ * PMD mapping table: pmd0-pmd15 are the generic counters,
+ * pmd16-pmd31 are fixed counters 0-15 (pmd index - 16).
+ *
+ * The last row maps fixed counters 12-15; it must continue the
+ * sequence so that hw_addr and the register name match the slot.
+ */
+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
+/* pmd0  */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3),
+/* pmd4  */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7),
+/* pmd8  */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
+
+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
+/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15)
+};
+#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc)
+
+#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */
+#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */
+#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */
+
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
+
+/*
+ * Flag the PMU as non-shareable on family 6 model 14 hosts.
+ */
+static void pfm_intel_arch_check_errata(void)
+{
+	if (current_cpu_data.x86 != 6)
+		return;
+	if (current_cpu_data.x86_model != 14)
+		return;
+
+	/*
+	 * Core Duo errata AE49 (no fix). Both counters share a single
+	 * enable bit in PERFEVTSEL0
+	 */
+	pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
+}
+
+/*
+ * Record PMC i in enable_mask and keep max_enable equal to the
+ * highest recorded index plus one.
+ */
+static inline void set_enable_mask(unsigned int i)
+{
+	unsigned int past_end = i + 1;
+
+	pfm_arch_bv_set_bit(i, enable_mask);
+
+	if (past_end > max_enable)
+		max_enable = past_end;
+}
+
+/*
+ * pfm_intel_arch_setup_generic - adapt the generic counter tables to the host
+ * @version: architectural perfmon version
+ * @width: counter width in bits, used to build the PMD reserved mask
+ * @count: number of generic counters reported by the host
+ *
+ *  - clamp count to the number of entries hardcoded in the tables
+ *  - adjust PMD rsvd_msk to the counter width
+ *  - initialize enable_mask (list of PMC with start/stop capability)
+ *  - mark unused hardcoded generic counters as unimplemented
+ */
+static void pfm_intel_arch_setup_generic(unsigned int version,
+					 unsigned int width,
+					 unsigned int count)
+{
+	u64 rsvd;
+	unsigned int i;
+
+	/*
+	 * min of number of HW counters and hardcoded in the tables;
+	 * only complain when the HW really has more than we can map
+	 */
+	if (count > PFM_IA_MAX_CNT) {
+		printk(KERN_INFO "perfmon: Limiting number of generic counters"
+				 " to %u, HW supports %u\n",
+				 PFM_IA_MAX_CNT, count);
+		count = PFM_IA_MAX_CNT;
+	}
+
+	/*
+	 * adjust rsvd_msk for generic counters based on actual width
+	 * initialize enable_mask (1 per pmd)
+	 *
+	 * a shift by 64 or more is undefined: a full-width (or bogus)
+	 * value leaves no reserved bits
+	 */
+	rsvd = width < 64 ? ~((1ULL << width)-1) : 0;
+	for (i = 0; i < count; i++) {
+		pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
+		set_enable_mask(i);
+	}
+
+	/*
+	 * handle version 3 new anythread bit (21)
+	 */
+	if (version == 3) {
+		for (i = 0; i < count; i++)
+			pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
+	}
+
+	/*
+	 * mark unused generic counters as not available
+	 */
+	for (i = count ; i < PFM_IA_MAX_CNT; i++) {
+		pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
+		pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
+	}
+}
+
+/*
+ * pfm_intel_arch_setup_fixed - adapt the fixed counter tables to the host
+ * @version: architectural perfmon version
+ * @width: counter width in bits, used to build the PMD reserved mask
+ * @count: number of fixed counters reported by the host
+ *
+ *  - clamp count to the number of entries hardcoded in the tables
+ *  - adjust PMD rsvd_msk to the counter width
+ *  - build rsvd_msk and dfl_val of FIXED_CTRL (PMC16), 4 bits per counter
+ *  - mark unused hardcoded fixed counters as unimplemented
+ */
+static void pfm_intel_arch_setup_fixed(unsigned int version,
+				       unsigned int width,
+				       unsigned int count)
+{
+	/* PMC16 is the fixed counter control register */
+	struct pfm_regmap_desc *ctrl = &pfm_intel_arch_pmc_desc[16];
+	u64 rsvd, dfl;
+	unsigned int i;
+
+	/* only complain when the HW really has more than we can map */
+	if (count > PFM_IA_MAX_FCNT) {
+		printk(KERN_INFO "perfmon: Limiting number of fixed counters"
+				 " to %u, HW supports %u\n",
+				 PFM_IA_MAX_FCNT, count);
+		count = PFM_IA_MAX_FCNT;
+	}
+
+	/*
+	 * adjust rsvd_msk for fixed counters based on actual width
+	 * (a shift by 64 or more is undefined, treat it as no reserved bits)
+	 */
+	rsvd = width < 64 ? ~((1ULL << width)-1) : 0;
+	for (i = 0; i < count; i++)
+		pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
+
+	/*
+	 * per-counter reserved bits within each 4-bit control field:
+	 * version 3 only reserves bit 3, earlier versions reserve
+	 * bits 2 and 3 (bit 2 is the anythread bit new in version 3)
+	 */
+	if (version == 3)
+		rsvd = 1ULL << 3;
+	else
+		rsvd = 3ULL << 2;
+
+	ctrl->rsvd_msk = 0;
+	for (i = 0; i < count; i++)
+		ctrl->rsvd_msk |= rsvd << (i<<2);
+
+	/*
+	 * mark unused fixed counters as unimplemented
+	 *
+	 * update the rsvd_msk, dfl_val in FIXED_CTRL:
+	 * - rsvd_msk: set all 4 bits
+	 * - dfl_val : clear all 4 bits
+	 */
+	dfl = ctrl->dfl_val;
+	rsvd = ctrl->rsvd_msk;
+
+	for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
+		pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
+		rsvd |= 0xfULL << (i<<2);
+		dfl &= ~(0xfULL << (i<<2));
+	}
+
+	/*
+	 * FIXED_CTR_CTRL unavailable when no fixed counters are defined
+	 */
+	if (!count) {
+		ctrl->type = PFM_REG_NA;
+	} else {
+		/* update rsvd_mask and dfl_val */
+		ctrl->rsvd_msk = rsvd;
+		ctrl->dfl_val = dfl;
+		set_enable_mask(16);
+	}
+}
+
+/*
+ * pfm_intel_arch_probe_pmu - detect and describe the host PMU
+ *
+ * Requires architectural perfmon support and a local APIC. Reads
+ * CPUID leaf 0xa, patches up known incomplete/buggy reports, then
+ * adapts the generic and fixed counter tables and records the
+ * detected version.
+ *
+ * return:
+ * 	 0 : PMU detected, tables set up
+ * 	-1 : unsupported host
+ */
+static int pfm_intel_arch_probe_pmu(void)
+{
+	union {
+		unsigned int val;
+		struct pmu_eax eax;
+		struct pmu_edx edx;
+	} eax, edx;
+	unsigned int ebx, ecx;
+	unsigned int width = 0;
+
+	edx.val = 0;
+
+	if (!cpu_has_arch_perfmon) {
+		PFM_INFO("no support for Intel architectural PMU");
+		return -1;
+	}
+
+	if (!cpu_has_apic) {
+		PFM_INFO("no Local APIC, try rebooting with lapic option");
+		return -1;
+	}
+
+	/* cpuid() call protected by cpu_has_arch_perfmon */
+	cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
+
+	/*
+	 * some 6/15 models have buggy BIOS
+	 */
+	if (eax.eax.version == 0
+	    && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
+		PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
+		eax.eax.version = 2;
+		eax.eax.num_cnt = 2;
+		eax.eax.cnt_width = 40;
+	}
+
+	/*
+	 * some v2 BIOSes are incomplete
+	 */
+	if (eax.eax.version == 2 && !edx.edx.num_cnt) {
+		PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
+		edx.edx.num_cnt = 3;
+		edx.edx.cnt_width = 40;
+	}
+
+	if (eax.eax.version < 2) {
+		/*
+		 * no fixed counters on earlier versions: the generic
+		 * counter width is the only one available. Leaving width
+		 * at 0 here would mark every PMD bit as reserved.
+		 */
+		edx.val = 0;
+		width = eax.eax.cnt_width;
+	} else {
+		/*
+		 * use the min value of both widths until we support
+		 * variable width counters
+		 */
+		width = eax.eax.cnt_width < edx.edx.cnt_width ?
+			eax.eax.cnt_width : edx.edx.cnt_width;
+	}
+
+	/*
+	 * Intel Atom processors have a buggy firmware which does not report
+	 * the correct number of fixed counters
+	 */
+	if (eax.eax.version == 3 && edx.edx.num_cnt < 3
+	    && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
+		PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
+		edx.edx.num_cnt = 3;
+	}
+
+	PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
+	PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
+		  eax.eax.num_cnt,
+		  eax.eax.cnt_width,
+		  edx.edx.num_cnt,
+		  edx.edx.cnt_width);
+
+	pfm_intel_arch_setup_generic(eax.eax.version,
+				     width,
+				     eax.eax.num_cnt);
+
+	pfm_intel_arch_setup_fixed(eax.eax.version,
+				   width,
+				   edx.edx.num_cnt);
+
+	pfm_intel_arch_check_errata();
+
+	pfm_intel_arch_version = eax.eax.version;
+
+	return 0;
+}
+
+/**
+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
+ * @ctx: context to work on
+ *
+ * Reads each counter PMD of the context and reports an overflow as
+ * soon as one has bit counter_width clear.
+ * return:
+ * 	0 : no overflow
+ * 	1 : at least one overflow
+ */
+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
+{
+	u64 *counters = ctx->regs.cnt_pmds;
+	u64 top_bit = 1ULL << pfm_pmu_conf->counter_width;
+	u64 raw;
+	u16 left = ctx->regs.num_counters;
+	u16 idx;
+
+	/*
+	 * MSR addresses come straight from the local PMD table.
+	 *
+	 * The counter bitmask must be consulted because not all
+	 * registers may be available.
+	 */
+	for (idx = 0; left; idx++) {
+		if (!pfm_arch_bv_test_bit(idx, counters))
+			continue;
+
+		rdmsrl(pfm_intel_arch_pmd_desc[idx].hw_addr, raw);
+		if (!(raw & top_bit))
+			return 1;
+		left--;
+	}
+	return 0;
+}
+
+/**
+ * pfm_intel_arch_stop_save - stop monitoring, collect pending overflows
+ * @ctx: context to use
+ * @set: event set to stop
+ *
+ * First writes 0 to every used PMC that is also in enable_mask, then,
+ * unless the set already has pending overflows, reads back all used
+ * PMDs into set->pmds[] and records newly detected overflows in
+ * set->povfl_pmds / set->npend_ovfls.
+ *
+ * return:
+ * 	1 : set already had pending overflows, PMDs were not saved here
+ * 	0 : PMDs saved, no need to save them at upper level
+ */
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u64 used_mask[PFM_PMC_BV];
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ /* a counter whose bit counter_width is clear is treated as overflowed */
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ /* only PMCs that are both in use and in enable_mask get stopped */
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already having a pending overflow condition, we simply
+ * return to take care of this first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ *
+ * all pmds are counters
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ /* keep the saved bits outside ovfl_mask, take the rest from HW */
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_intel_arch_quiesce(void)
+{
+	u16 r;
+
+	/*
+	 * Every available PMC is cleared. PMC16 is the fixed control
+	 * register and lives at its own MSR address; all others are
+	 * consecutive from MSR_P6_EVNTSEL0.
+	 *
+	 * The hw_addr field of the table is deliberately not used, to
+	 * avoid touching too many cachelines.
+	 */
+	for (r = 0; r < pfm_pmu_conf->regs_all.max_pmc; r++) {
+		if (!pfm_arch_bv_test_bit(r, pfm_pmu_conf->regs_all.pmcs))
+			continue;
+
+		if (r == 16)
+			wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+		else
+			wrmsrl(MSR_P6_EVNTSEL0+r, 0);
+	}
+}
+/**
+* pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we simply make sure that all available counters are enabled.
+* After that, start/stop is controlled on a per-counter basis.
+*
+* The previous GLOBAL_CTRL value is kept in the per-CPU variable
+* saved_global_ctrl.
+*/
+static void pfm_intel_arch_acquire_pmu_percpu(void)
+{
+ struct pfm_regmap_desc *d;
+ u64 mask = 0;
+ unsigned int i;
+
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ /*
+ * build bitmask of registers that are available to
+ * us. In some cases, there may be fewer registers than
+ * what the PMU supports due to sharing with other kernel
+ * subsystems, such as NMI
+ */
+ d = pfm_pmu_conf->pmd_desc;
+ /* generic counters (pmd0-pmd15) map to mask bits 0-15 */
+ for (i=0; i < 16; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << i;
+ }
+ /* fixed counters (pmd16 and up) map to mask bits 32 and up */
+ for (i=16; i < PFM_IA_MAX_PMDS; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << (32+i-16);
+ }
+ /*
+ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
+ */
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+
+ PFM_DBG("global=0x%llx set to 0x%llx",
+ __get_cpu_var(saved_global_ctrl),
+ mask);
+ /*
+ * enable all registers
+ *
+ * No need to quiesce PMU. If there is a overflow, it will be
+ * treated as spurious by the handler
+ */
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
+}
+
+/**
+* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we are done using the PMU, so we write back the GLOBAL_CTRL
+* value held in the per-CPU variable saved_global_ctrl.
+*/
+static void pfm_intel_arch_release_pmu_percpu(void)
+{
+	/* nothing to do for v1 */
+	if (pfm_intel_arch_version < 2)
+		return;
+
+	/* no trailing newline, like every other PFM_DBG() call in this file */
+	PFM_DBG("global_ctrl restored to 0x%llx",
+		__get_cpu_var(saved_global_ctrl));
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+}
+
+/*
+ * Counters may have model-specific width. Yet the documentation says
+ * that only the lower 32 bits can be written to due to the specification
+ * of wrmsr. bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must
+ * not be set (see rsvd_msk for PMDs). As such the effective width of a
+ * counter is 31 bits only regardless of what CPUID.0xa returns.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
+ */
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
+ .pmu_name = "Intel architectural",
+ .pmd_desc = pfm_intel_arch_pmd_desc,
+ /* bit index tested by the overflow checks (1ULL << counter_width) */
+ .counter_width = 31,
+ .num_pmc_entries = PFM_IA_MAX_PMCS,
+ .num_pmd_entries = PFM_IA_MAX_PMDS,
+ .pmc_desc = pfm_intel_arch_pmc_desc,
+ .version = "1.0",
+ /* model-specific callbacks */
+ .pmu_info = &pfm_intel_arch_pmu_info
+};
+
+/*
+ * Probe the host; register the Intel architectural PMU description
+ * only when the probe succeeds, otherwise report -ENOSYS.
+ */
+static int __init pfm_intel_arch_pmu_init_module(void)
+{
+	int probe_failed = pfm_intel_arch_probe_pmu();
+
+	return probe_failed ? -ENOSYS
+			    : pfm_pmu_register(&pfm_intel_arch_pmu_conf);
+}
+
+device_initcall(pfm_intel_arch_pmu_init_module);