43 files changed, 2371 insertions, 37 deletions
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..ad604df6a2b6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT
 config IA64_MCA_RECOVERY
 	tristate "MCA recovery from errors other than TLB."
 
-config PERFMON
+config PERFMON_V20
 	bool "Performance monitor support"
 	help
 	  Selects whether support for the IA-64 performance monitor hardware
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 6dd8655664f3..2c04fbe6c414 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_IA32_SUPPORT=y
 CONFIG_COMPAT=y
 # CONFIG_IA64_MCA_RECOVERY is not set
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
 CONFIG_IA64_PALINFO=y
 
 #
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index e05f9e1d3faa..7d89a19fc8b3 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y
 CONFIG_COMPAT=y
 CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
 CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
 CONFIG_IA64_PALINFO=y
 # CONFIG_IA64_MC_ERR_INJECT is not set
 CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index e86fbd39c795..5f8c7721e29a 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
 CONFIG_IA32_SUPPORT=y
 CONFIG_COMPAT=y
 CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
 CONFIG_IA64_PALINFO=y
 CONFIG_SGI_SN=y
 
diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig
index 546a772f438e..d51457af7ca6 100644
--- a/arch/ia64/configs/sim_defconfig
+++ b/arch/ia64/configs/sim_defconfig
@@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_IA32_SUPPORT=y
 CONFIG_COMPAT=y
 # CONFIG_IA64_MCA_RECOVERY is not set
-# CONFIG_PERFMON is not set
+# CONFIG_PERFMON_V20 is not set
 CONFIG_IA64_PALINFO=m
 
 #
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index c522edf23c62..318d846ab253 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y
 CONFIG_HOLES_IN_ZONE=y
 # CONFIG_IA32_SUPPORT is not set
 CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
 CONFIG_IA64_PALINFO=y
 # CONFIG_IA64_MC_ERR_INJECT is not set
 # CONFIG_IA64_ESI is not set
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 0a06b1333c95..2bf0ad40398f 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y
 CONFIG_IA32_SUPPORT=y
 CONFIG_COMPAT=y
 CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
 CONFIG_IA64_PALINFO=y
 # CONFIG_IA64_ESI is not set
 # CONFIG_KEXEC is not set
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index f88fa054d01d..3ecf7e0b44cb 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -321,7 +321,7 @@ struct thread_struct {
 #else
 # define INIT_THREAD_IA32
 #endif /* CONFIG_IA32_SUPPORT */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	void *pfm_context;		     /* pointer to detailed PMU context */
 	unsigned long pfm_needs_checking;    /* when >0, pending perfmon work on kernel exit */
 # define INIT_THREAD_PM		.pfm_context =		NULL,     \
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 927a381c20ca..387e54030af1 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
 # define IA64_ACCOUNT_ON_SWITCH(p,n)
 #endif
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
   DECLARE_PER_CPU(unsigned long, pfm_syst_info);
 # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
 #else
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea954892..93819cca7d96 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC)		+= iosapic.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_NUMA)		+= numa.o
-obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
+obj-$(CONFIG_PERFMON_V20)	+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..db54bd497cf6 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -40,7 +40,7 @@
 #include <asm/system.h>
 #include <asm/tlbflush.h>
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 # include <asm/perfmon.h>
 #endif
 
@@ -660,7 +660,7 @@ init_IRQ (void)
 	}
 #endif
 #endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	pfm_init_percpu();
 #endif
 	platform_irq_init();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309b..5f6efcfa2de4 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -52,7 +52,7 @@
 #include <asm/uaccess.h>
 #include <asm/delay.h>
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 /*
  * perfmon context state
  */
@@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 	 * the psr bits are already set properly in copy_threads()
 	 */
 }
-#else  /* !CONFIG_PERFMON */
+#else  /* !CONFIG_PERFMON_V20 */
 asmlinkage long
 sys_perfmonctl (int fd, int cmd, void *arg, int count)
 {
 	return -ENOSYS;
 }
-#endif /* CONFIG_PERFMON */
+#endif /* CONFIG_PERFMON_V20 */
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index c57162705147..afbf1a8205ee 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -46,7 +46,7 @@
 
 #include "entry.h"
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 # include <asm/perfmon.h>
 #endif
 
@@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 		return;
 	}
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	if (current->thread.pfm_needs_checking)
 		/*
 		 * Note: pfm_handle_work() allow us to call it with interrupts
@@ -334,14 +334,14 @@ cpu_idle (void)
 void
 ia64_save_extra (struct task_struct *task)
 {
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	unsigned long info;
 #endif
 
 	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
 		ia64_save_debug_regs(&task->thread.dbr[0]);
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
 		pfm_save_regs(task);
 
@@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task)
 void
 ia64_load_extra (struct task_struct *task)
 {
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	unsigned long info;
 #endif
 
 	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
 		ia64_load_debug_regs(&task->thread.dbr[0]);
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
 		pfm_load_regs(task);
 
@@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags,
 	}
 #endif
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	if (current->thread.pfm_context)
 		pfm_inherit(p, child_ptregs);
 #endif
@@ -735,7 +735,7 @@ exit_thread (void)
 {
 
 	ia64_drop_fpu(current);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
        /* if needed, stop monitoring and flush state to perfmon context */
 	if (current->thread.pfm_context)
 		pfm_exit_thread(current);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 92c9689b7d97..ffd212fd2d36 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -31,7 +31,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/unwind.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 #include <asm/perfmon.h>
 #endif
 
@@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr,
 				"address 0x%lx\n", addr);
 		return -1;
 	}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	/*
 	 * Check if debug registers are used by perfmon. This
 	 * test must be done once we know that we can do the
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1dcbb85fc4ee..f865315a9248 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -381,7 +381,7 @@ smp_callin (void)
 	extern void ia64_init_itm(void);
 	extern volatile int time_keeper_id;
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	extern void pfm_init_percpu(void);
 #endif
 
@@ -411,7 +411,7 @@ smp_callin (void)
 
 	ia64_mca_cmc_vector_setup();	/* Setup vector on AP */
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	pfm_init_percpu();
 #endif
 
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 98771e2a78af..754f4153123e 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
 
 obj-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
 obj-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
-lib-$(CONFIG_PERFMON)	+= carta_random.o
+lib-$(CONFIG_PERFMON_V20) += carta_random.o
 
 AFLAGS___divdi3.o	=
 AFLAGS___udivdi3.o	= -DUNSIGNED
diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile
index aad27a718ee0..3323fd5a46e9 100644
--- a/arch/ia64/oprofile/Makefile
+++ b/arch/ia64/oprofile/Makefile
@@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
 		timer_int.o )
 
 oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_PERFMON) += perfmon.o
+oprofile-$(CONFIG_PERFMON_V20) += perfmon.o
diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c
index 31b545c35460..9ed2bc152fba 100644
--- a/arch/ia64/oprofile/init.c
+++ b/arch/ia64/oprofile/init.c
@@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
 	int ret = -ENODEV;
 
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	/* perfmon_init() can fail, but we have no way to report it */
 	ret = perfmon_init(ops);
 #endif
@@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 
 void oprofile_arch_exit(void)
 {
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
 	perfmon_exit();
 #endif
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b5e714373385..cdc53491c033 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 
+source "arch/x86/perfmon/Kconfig"
+
 endmenu
 
 config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cf72b569db41..f3af2b0b4f15 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
 core-y += arch/x86/kernel/
 core-y += arch/x86/mm/
 
+# perfmon support
+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
+
 # Remaining sub architecture files
 core-y += $(mcore-y)
 
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..891af3e6b3a6 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -826,4 +826,9 @@ ia32_sys_call_table:
 	.quad sys_dup3			/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
+	.quad sys_pfm_create
+	.quad sys_pfm_write
+	.quad sys_pfm_read		/* 335 */
+	.quad sys_pfm_attach
+	.quad sys_pfm_set_state
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..15d495f73485 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += perfmon.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..0ba6dd3aa24e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
 #define LOCAL_TIMER_VECTOR	0xef
 
 /*
+ * Perfmon PMU interrupt vector
+ */
+#define LOCAL_PERFMON_VECTOR	0xee
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..e940722dc1f0 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_PERFMON
+BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR)
+#endif
+
 #endif
diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h
new file mode 100644
index 000000000000..906f4b24cf0c
--- /dev/null
+++ b/arch/x86/include/asm/perfmon.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This file contains i386/x86_64 specific definitions for the perfmon
+ * interface.
+ *
+ * This file MUST never be included directly. Use linux/perfmon.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+  */
+#ifndef _ASM_X86_PERFMON__H_
+#define _ASM_X86_PERFMON__H_
+
+/*
+ * arch-specific user visible interface definitions
+ */
+
+#define PFM_ARCH_MAX_PMCS	(256+64) /* 256 HW 64 SW */
+#define PFM_ARCH_MAX_PMDS	(256+64) /* 256 HW 64 SW */
+
+#endif /* _ASM_X86_PERFMON__H_ */
diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h
new file mode 100644
index 000000000000..7cadbb894e83
--- /dev/null
+++ b/arch/x86/include/asm/perfmon_kern.h
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This file contains X86 Processor Family specific definitions
+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+  */
+#ifndef _ASM_X86_PERFMON_KERN_H_
+#define _ASM_X86_PERFMON_KERN_H_
+
+#ifdef CONFIG_PERFMON
+#include <linux/unistd.h>
+#ifdef CONFIG_4KSTACKS
+#define PFM_ARCH_STK_ARG	8
+#else
+#define PFM_ARCH_STK_ARG	16
+#endif
+
+struct pfm_arch_pmu_info {
+	u32 flags;		/* PMU feature flags */
+	/*
+	 * mandatory model-specific callbacks
+	 */
+	int  (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
+	int  (*has_ovfls)(struct pfm_context *ctx);
+	void (*quiesce)(void);
+
+	/*
+	 * optional model-specific callbacks
+	 */
+	void (*acquire_pmu_percpu)(void);
+	void (*release_pmu_percpu)(void);
+	int (*load_context)(struct pfm_context *ctx);
+	void (*unload_context)(struct pfm_context *ctx);
+};
+
+/*
+ * PMU feature flags
+ */
+#define PFM_X86_FL_NO_SHARING	0x02	/* no sharing with other subsystems */
+#define PFM_X86_FL_SHARING	0x04	/* PMU is being shared */
+
+struct pfm_x86_ctx_flags {
+	unsigned int insecure:1;  /* rdpmc per-thread self-monitoring */
+	unsigned int reserved:31; /* for future use */
+};
+
+struct pfm_arch_context {
+	u64 saved_real_iip;		/* instr pointer of last NMI intr */
+	struct pfm_x86_ctx_flags flags;	/* flags */
+	int saved_started;
+};
+
+/*
+ * functions implemented as inline on x86
+ */
+
+/**
+ * pfm_arch_write_pmc - write a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ * @value: PMC 64-bit value
+ *
+ * in certain situations, ctx may be NULL
+ */
+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
+				      unsigned int cnum, u64 value)
+{
+	/*
+	 * we only write to the actual register when monitoring is
+	 * active (pfm_start was issued)
+	 */
+	if (ctx && ctx->flags.started == 0)
+		return;
+
+	PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
+		     pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+		     (unsigned long long) value);
+
+	wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_write_pmd - write a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ * @value: PMD 64-bit value
+ */
+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
+				      unsigned int cnum, u64 value)
+{
+	/*
+	 * to make sure the counter overflows, we set the
+	 * upper bits. we also clear any other unimplemented
+	 * bits as this may cause crash on some processors.
+	 */
+	if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
+		value = (value | ~pfm_pmu_conf->ovfl_mask)
+		      & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+	PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
+		     pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+		     (unsigned long long) value);
+
+	wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_read_pmd - read a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+	u64 tmp;
+
+	rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
+
+	PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
+		     pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+		     (unsigned long long) tmp);
+	return tmp;
+}
+
+/**
+ * pfm_arch_read_pmc - read a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
+{
+	u64 tmp;
+
+	rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
+
+	PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
+		     pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+		     (unsigned long long) tmp);
+	return tmp;
+}
+
+/**
+ * pfm_arch_is_active - return non-zero if monitoring has been started
+ * @ctx: context to check
+ *
+ * At certain points, perfmon needs to know if monitoring has been
+ * explicitly started.
+ *
+ * On x86, there is no other way but to use pfm_start/pfm_stop
+ * to activate monitoring, thus we can simply check flags.started
+ */
+static inline int pfm_arch_is_active(struct pfm_context *ctx)
+{
+	return ctx->flags.started;
+}
+
+
+/**
+ * pfm_arch_unload_context - detach context from thread or CPU
+ * @ctx: context to detach
+ *
+ * in system-wide ctx->task is NULL, otherwise it points to the
+ * attached thread
+ */
+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+	struct pfm_arch_context *ctx_arch;
+
+	ctx_arch = pfm_ctx_arch(ctx);
+	pmu_info = pfm_pmu_info();
+
+	if (ctx_arch->flags.insecure) {
+		PFM_DBG("clear cr4.pce");
+		clear_in_cr4(X86_CR4_PCE);
+	}
+
+	if (pmu_info->unload_context)
+		pmu_info->unload_context(ctx);
+}
+
+/**
+ * pfm_arch_load_context - attach context to thread or CPU
+ * @ctx: context to attach
+ */
+static inline int pfm_arch_load_context(struct pfm_context *ctx)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+	struct pfm_arch_context *ctx_arch;
+	int ret = 0;
+
+	ctx_arch = pfm_ctx_arch(ctx);
+	pmu_info = pfm_pmu_info();
+
+	/*
+	 * RDPMC authorized in system-wide and
+	 * per-thread self-monitoring.
+	 *
+	 * RDPMC only gives access to counts.
+	 *
+	 * The context-switch routine code does not restore
+	 * all the PMD registers (optimization), thus there
+	 * is a possible leak of counts there in per-thread
+	 * mode.
+	 */
+	if (ctx->task == current) {
+		PFM_DBG("set cr4.pce");
+		set_in_cr4(X86_CR4_PCE);
+		ctx_arch->flags.insecure = 1;
+	}
+
+	if (pmu_info->load_context)
+		ret = pmu_info->load_context(ctx);
+
+	return ret;
+}
+
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
+
+/**
+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
+ * @ctx: current context
+ * @set: current event set
+ *
+ * called from __pfm_interrupt_handler().
+ * ctx is not NULL. ctx is locked. interrupts are masked
+ *
+ * The following actions must take place:
+ *  - stop all monitoring to ensure handler has consistent view.
+ *  - collect overflowed PMDs bitmask into povfls_pmds and
+ *    npend_ovfls. If no interrupt detected then npend_ovfls
+ *    must be set to zero.
+ */
+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
+					    struct pfm_event_set *set)
+{
+	struct pfm_arch_context *ctx_arch;
+	ctx_arch = pfm_ctx_arch(ctx);
+	/*
+	 * on X86, freezing is equivalent to stopping
+	 */
+	pfm_arch_stop(current, ctx);
+
+	/*
+	 * we mark monitoring as stopped to avoid
+	 * certain side effects especially in
+	 * pfm_arch_restore_pmcs()
+	 */
+	ctx_arch->saved_started = ctx->flags.started;
+	ctx->flags.started = 0;
+}
+
+/**
+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
+ * @ctx: current context
+ *
+ * current context may be NULL when dealing with spurious interrupts
+ *
+ * Must re-activate monitoring if context is not MASKED.
+ * interrupts are masked.
+ */
+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
+{
+	struct pfm_arch_context *ctx_arch;
+
+	if (ctx == NULL)
+		return;
+
+	ctx_arch = pfm_ctx_arch(ctx);
+
+	PFM_DBG_ovfl("state=%d", ctx->state);
+
+	/*
+	 * restore flags.started which is cleared in
+	 * pfm_arch_intr_freeze_pmu()
+	 */
+	ctx->flags.started = ctx_arch->saved_started;
+
+	pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
+ * @ctx: current context
+ * @cnum: PMD index
+ *
+ * On some CPUs, the upper bits of a counter must be set in order for the
+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
+ * and the upper bits are cleared. This function may be used to set them back.
+ *
+ * For x86, the current version loses whatever is remaining in the counter,
+ * which usually has a small count. In order not to lose this count,
+ * we do a read-modify-write to set the upper bits while preserving the
+ * low-order bits. This is slow but works.
+ */
+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+	u64 val;
+	val = pfm_arch_read_pmd(ctx, cnum);
+	pfm_arch_write_pmd(ctx, cnum, val);
+}
+
+/**
+ * pfm_arch_context_create - create context
+ * @ctx: newly created context
+ * @ctx_flags: context flags as passed by user
+ *
+ * called from __pfm_create_context()
+ */
+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
+{
+	return 0;
+}
+
+/**
+ * pfm_arch_context_free - free context
+ * @ctx: context to free
+ */
+static inline void pfm_arch_context_free(struct pfm_context *ctx)
+{}
+
+/*
+ * functions implemented in arch/x86/perfmon/perfmon.c
+ */
+int  pfm_arch_init(void);
+void pfm_arch_resend_irq(struct pfm_context *ctx);
+
+int  pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
+
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
+int  pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
+void pfm_arch_pmu_config_remove(void);
+char *pfm_arch_get_pmu_module_name(void);
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
+void pfm_arch_pmu_release(void);
+
+static inline void pfm_arch_serialize(void)
+{}
+
+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
+{}
+
+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
+{}
+
+#define PFM_ARCH_CTX_SIZE	(sizeof(struct pfm_arch_context))
+/*
+ * x86 does not need extra alignment requirements for the sampling buffer
+ */
+#define PFM_ARCH_SMPL_ALIGN_SIZE	0
+
+asmlinkage void  pmu_interrupt(void);
+
+static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits)
+{
+	bitmap_copy((unsigned long *)a,
+		    (unsigned long *)b,
+		    nbits);
+}
+
+static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits)
+{
+	bitmap_or((unsigned long *)a,
+		  (unsigned long *)b,
+		  (unsigned long *)c,
+		  nbits);
+}
+
+static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits)
+{
+	bitmap_and((unsigned long *)a,
+		  (unsigned long *)b,
+		  (unsigned long *)c,
+		  nbits);
+}
+
+
+static inline void pfm_arch_bv_zero(u64 *a, int nbits)
+{
+	bitmap_zero((unsigned long *)a, nbits);
+}
+
+static inline int pfm_arch_bv_weight(u64 *a, int nbits)
+{
+	return bitmap_weight((unsigned long *)a, nbits);
+}
+
+static inline void pfm_arch_bv_set_bit(int b, u64 *a)
+{
+	__set_bit(b, (unsigned long *)a);
+}
+
+static inline void pfm_arch_bv_clear_bit(int b, u64 *a)
+{
+	__clear_bit(b, (unsigned long *)a);
+}
+
+static inline int pfm_arch_bv_test_bit(int b, u64 *a)
+{
+	return test_bit(b, (unsigned long *)a);
+}
+
+/* Pass the u64 bitmap to find_next_bit(), keeping @addr const-qualified. */
+static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr,
+						      unsigned long size,
+						      unsigned long offset)
+{
+	/* cast only changes the word type, it must not drop const */
+	return find_next_bit((const unsigned long *)addr, size, offset);
+}
+#endif /* CONFIG_PERFMON */
+
+#endif /* _ASM_X86_PERFMON_KERN_H_ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0ddd534bef44 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
+#define TIF_PERFMON_WORK	9	/* work for pfm_handle_work() */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
@@ -92,6 +93,7 @@ struct thread_info {
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_BTS_TRACE_TS	27      /* record scheduling event timestamps */
+#define TIF_PERFMON_CTXSW	28	/* perfmon needs ctxsw calls */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -114,6 +116,8 @@ struct thread_info {
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_BTS_TRACE_TS	(1 << TIF_BTS_TRACE_TS)
+#define _TIF_PERFMON_WORK	(1 << TIF_PERFMON_WORK)
+#define _TIF_PERFMON_CTXSW	(1 << TIF_PERFMON_CTXSW)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -135,12 +139,12 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME|_TIF_PERFMON_WORK)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
-								_TIF_NOTSC)
+	 _TIF_NOTSC|_TIF_PERFMON_CTXSW)
 
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..06908451002f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,11 @@
 #define __NR_dup3		330
 #define __NR_pipe2		331
 #define __NR_inotify_init1	332
+#define __NR_pfm_create		333
+#define __NR_pfm_write		(__NR_pfm_create+1)
+#define __NR_pfm_read		(__NR_pfm_create+2)
+#define __NR_pfm_attach		(__NR_pfm_create+3)
+#define __NR_pfm_set_state	(__NR_pfm_create+4)
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89fb..a42bb5eb9edb 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1			294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_pfm_create				295
+__SYSCALL(__NR_pfm_create, sys_pfm_create)
+#define __NR_pfm_write				(__NR_pfm_create+1)
+__SYSCALL(__NR_pfm_write, sys_pfm_write)
+#define __NR_pfm_read				(__NR_pfm_create+2)
+__SYSCALL(__NR_pfm_read, sys_pfm_read)
+#define __NR_pfm_attach				(__NR_pfm_create+3)
+__SYSCALL(__NR_pfm_attach, sys_pfm_attach)
+#define __NR_pfm_set_state			(__NR_pfm_create+4)
+__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9134de814c97..9f8826f33032 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -513,7 +513,7 @@ ENDPROC(system_call)
 	ALIGN
 	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
+	testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx
 	jz work_notifysig
 work_resched:
 	call schedule
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 983d85aeccce..1d9bef0797d9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -876,7 +876,13 @@ END(error_interrupt)
 ENTRY(spurious_interrupt)
 	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
 END(spurious_interrupt)
-				
+
+#ifdef CONFIG_PERFMON
+ENTRY(pmu_interrupt)
+	apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt
+END(pmu_interrupt)
+#endif
+
 /*
  * Exception entry points.
  */ 		
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..24a0140e6c36 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,6 +11,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
+#include <linux/perfmon_kern.h>
 
 #include <asm/acpi.h>
 #include <asm/atomic.h>
@@ -224,6 +225,10 @@ void __init native_init_IRQ(void)
 
 	apic_intr_init();
 
+#ifdef CONFIG_PERFMON
+	alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
+
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..7ff71d4d6d9b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,6 +36,7 @@
 #include <linux/personality.h>
 #include <linux/tick.h>
 #include <linux/percpu.h>
+#include <linux/perfmon_kern.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
 
@@ -258,6 +259,7 @@ void exit_thread(void)
 		ds_free(current->thread.ds_ctx);
 	}
 #endif /* CONFIG_X86_DS */
+	pfm_exit_thread();
 }
 
 void flush_thread(void)
@@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 
 	savesegment(gs, p->thread.gs);
 
+	pfm_copy_thread(p);
+
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
+ 	if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ 		pfm_ctxsw_out(prev_p, next_p);
+
 	debugctl = update_debugctl(prev, next, prev->debugctlmsr);
 
 	if (next->debugctlmsr != debugctl)
 		update_debugctlmsr(next->debugctlmsr);
 
+	if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+		pfm_ctxsw_in(prev_p, next_p);
+
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
 		set_debugreg(next->debugreg0, 0);
 		set_debugreg(next->debugreg1, 1);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3180e79c3697..86099f98104a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/tick.h>
+#include <linux/perfmon_kern.h>
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
@@ -255,6 +256,7 @@ void exit_thread(void)
 		ds_free(t->ds_ctx);
 	}
 #endif /* CONFIG_X86_DS */
+	pfm_exit_thread();
 }
 
 void flush_thread(void)
@@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 	savesegment(es, p->thread.es);
 	savesegment(ds, p->thread.ds);
 
+	pfm_copy_thread(p);
+
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
@@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 	prev = &prev_p->thread,
 	next = &next_p->thread;
 
+	if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+		pfm_ctxsw_out(prev_p, next_p);
+
 	debugctl = prev->debugctlmsr;
 
 #ifdef CONFIG_X86_DS
@@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 	if (next->debugctlmsr != debugctl)
 		update_debugctlmsr(next->debugctlmsr);
 
+	if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+		pfm_ctxsw_in(prev_p, next_p);
+
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
 		loaddebug(next, 0);
 		loaddebug(next, 1);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 27a5c8174322..7d6fc603dea7 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -19,6 +19,7 @@
 #include <linux/wait.h>
 #include <linux/tracehook.h>
 #include <linux/elf.h>
+#include <linux/perfmon_kern.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
 
@@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		mce_notify_user();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
+  	/* process perfmon asynchronous work (e.g. block thread or reset) */
+  	if (thread_info_flags & _TIF_PERFMON_WORK)
+  		pfm_handle_work(regs);
+
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index d2307e41fbdb..24e389836fc0 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -21,6 +21,7 @@
 #include <linux/personality.h>
 #include <linux/compiler.h>
 #include <linux/uaccess.h>
+#include <linux/perfmon_kern.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		mce_notify_user();
 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
+ 	/* process perfmon asynchronous work (e.g. block thread or reset) */
+ 	if (thread_info_flags & _TIF_PERFMON_WORK)
+ 		pfm_handle_work(regs);
+
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..81c22739f70b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,8 @@ ENTRY(sys_call_table)
 	.long sys_dup3			/* 330 */
 	.long sys_pipe2
 	.long sys_inotify_init1
+	.long sys_pfm_create
+	.long sys_pfm_write
+	.long sys_pfm_read		/* 335 */
+	.long sys_pfm_attach
+	.long sys_pfm_set_state
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 022cd41ea9b4..584a9ef4e44c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kdebug.h>
 #include <linux/cpu.h>
+#include <linux/perfmon_kern.h>
 #include <asm/nmi.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
@@ -142,12 +143,18 @@ static int nmi_setup(void)
 	int err = 0;
 	int cpu;
 
-	if (!allocate_msrs())
+	if (pfm_session_allcpus_acquire())
+		return -EBUSY;
+
+	if (!allocate_msrs()) {
+		pfm_session_allcpus_release();
 		return -ENOMEM;
+	}
 
 	err = register_die_notifier(&profile_exceptions_nb);
 	if (err) {
 		free_msrs();
+		pfm_session_allcpus_release();
 		return err;
 	}
 
@@ -228,6 +235,7 @@ static void nmi_shutdown(void)
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
 	free_msrs();
+	pfm_session_allcpus_release();
 	put_cpu_var(cpu_msrs);
 }
 
diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig
new file mode 100644
index 000000000000..8144d1d0d600
--- /dev/null
+++ b/arch/x86/perfmon/Kconfig
@@ -0,0 +1,33 @@
+menu "Hardware Performance Monitoring support"
+config PERFMON
+	bool "Perfmon2 performance monitoring interface"
+	select X86_LOCAL_APIC
+	default n
+	help
+	Enables the perfmon2 interface to access the hardware
+	performance counters. See <http://perfmon2.sf.net/> for
+	more details.
+
+config PERFMON_DEBUG
+	bool "Perfmon debugging"
+	default n
+	depends on PERFMON
+	help
+	Enables perfmon debugging support
+
+config X86_PERFMON_INTEL_ARCH
+	bool "Support for Intel architectural perfmon v1/v2/v3"
+	depends on PERFMON
+	default n
+	help
+	Enables support for Intel architectural performance counters.
+	This feature was introduced with Intel Core Solo/Core Duo processors.
+
+config X86_PERFMON_AMD64
+	bool "Support AMD Athlon/Opteron hardware performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables support for Athlon/Opteron hardware performance counters.
+	Support for family 6, 15 and 16 processors.
+endmenu
diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile
new file mode 100644
index 000000000000..c0a4ca0da329
--- /dev/null
+++ b/arch/x86/perfmon/Makefile
@@ -0,0 +1,7 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
+#
+obj-$(CONFIG_PERFMON)			+= perfmon.o
+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH)	+= perfmon_intel_arch.o
+obj-$(CONFIG_X86_PERFMON_AMD64)		+= perfmon_amd64.o
diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c
new file mode 100644
index 000000000000..844f19dc6cb0
--- /dev/null
+++ b/arch/x86/perfmon/perfmon.c
@@ -0,0 +1,619 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/interrupt.h>
+#include <linux/perfmon_kern.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU(unsigned long, real_iip);	/* IP saved by NMI handler */
+DEFINE_PER_CPU(int, pfm_using_nmi);	/* set if LVTPC reads APIC_DM_NMI */
+
+/**
+ * pfm_arch_ctxswin_thread - thread context switch in
+ * @task: task switched in
+ * @ctx: context for the task
+ *       (the event set used is ctx->active_set)
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * ctx->active_set cannot be NULL. Context is locked. Interrupts are masked.
+ *
+ * Caller has already restored all PMD and PMC registers, if
+ * necessary (i.e., lazy restore scheme).
+ *
+ * On x86, the common code only restores real_iip and re-enables RDPMC
+ *
+ * Model-specific features, e.g., PEBS, IBS, are taken care of in the
+ * corresponding PMU description module
+ */
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+	struct pfm_arch_context *ctx_arch;
+
+	ctx_arch = pfm_ctx_arch(ctx);
+
+	/*
+	 * restore saved real iip, if the set still has pending overflows
+	 */
+	if (ctx->active_set->npend_ovfls)
+		__get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
+
+	/*
+	 * enable RDPMC on this CPU, only for contexts flagged insecure
+	 */
+	if (ctx_arch->flags.insecure)
+		set_in_cr4(X86_CR4_PCE);
+}
+
+/**
+ * pfm_arch_ctxswout_thread - context switch out thread
+ * @task: task switched out
+ * @ctx: context switched out
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked. Monitoring may be active.
+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
+ *
+ * Return (value of the model-specific stop_save() callback):
+ * 	non-zero : did not save PMDs (as part of stopping the PMU)
+ * 	       0 : saved PMDs (no need to save them in caller)
+ */
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+	struct pfm_arch_context *ctx_arch;
+	struct pfm_arch_pmu_info *pmu_info;
+
+	ctx_arch = pfm_ctx_arch(ctx);
+	pmu_info = pfm_pmu_info();
+
+	/*
+	 * disable lazy restore of PMCS on ctxswin because
+	 * we modify some of them.
+	 */
+	ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+
+	if (ctx->active_set->npend_ovfls)
+		ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
+
+	/*
+	 * disable RDPMC on this CPU, only for contexts flagged insecure
+	 */
+	if (ctx_arch->flags.insecure)
+		clear_in_cr4(X86_CR4_PCE);
+
+	return pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_stop - deactivate monitoring
+ * @task: task whose monitoring is stopped
+ * @ctx: context to stop, cannot be NULL
+ *
+ * Called from pfm_stop() with interrupts masked and the context locked.
+ * The set which is stopped is the active set of the context.
+ *
+ * For per-thread:
+ *   task may not be current, in which case it is guaranteed to be
+ *   stopped and off any cpu. Access to the PMU is not guaranteed
+ *   then, so nothing is done here for such a task.
+ *
+ * Active monitoring is disabled via the model-specific stop_save().
+ */
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
+{
+	struct pfm_arch_pmu_info *pmu_info = pfm_pmu_info();
+
+	/*
+	 * two cases where there is nothing to do:
+	 *  - monitoring is already stopped, so there is no need
+	 *    to go through stop_save()
+	 *  - task is not current, so it is off any cpu and
+	 *    access to the PMU is not guaranteed
+	 * the checks are made in that order
+	 */
+	if (!ctx->flags.started || task != current)
+		return;
+
+	/* monitoring is active for the current task */
+	pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+
+/**
+ * pfm_arch_start - activate monitoring
+ * @task: task to start
+ * @ctx: context to start
+ *
+ * Interrupts are masked. Context is locked.
+ *
+ * For per-thread:
+ *	Task is not necessarily current. If it is not current, then
+ *	the task is guaranteed stopped and off any cpu, and there is
+ *	no access to the PMU.
+ */
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
+{
+	/*
+	 * PMCs can only be restored with access to the PMU,
+	 * i.e., when the task is current. For any other task,
+	 * nothing is done here: the restore happens when the
+	 * thread is switched back in
+	 */
+	if (task == current)
+		pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_restore_pmds - reload PMD registers
+ * @ctx: context to restore from
+ * @set: current event set, cannot be NULL
+ *
+ * Called from pfm_context_load() and pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked.
+ * Access to the PMU is guaranteed.
+ */
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+	u16 pmd, left;
+
+	/*
+	 * restoring only the PMDs in use is enough because:
+	 *
+	 *	- pfm_read_pmds() can only read registers which were
+	 *	  declared used via pfm_write_pmds()
+	 *
+	 *	- with cr4.pce=1, only counters are exposed to user
+	 *	  as RDPMC does not work with other PMU registers, so
+	 *	  counters never expose an address
+	 *
+	 *	- a pmd register never depends on another one
+	 */
+	pmd = 0;
+	left = set->nused_pmds;
+	while (left) {
+		if (likely(pfm_arch_bv_test_bit(pmd, set->used_pmds))) {
+			pfm_write_pmd(ctx, pmd, set->pmds[pmd]);
+			left--;
+		}
+		pmd++;
+	}
+}
+
+/**
+ * pfm_arch_restore_pmcs - reload PMC registers
+ * @ctx: context to restore from
+ * @set: current event set, cannot be NULL
+ *
+ * Called from pfm_context_load() and pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked.
+ * Access to the PMU is guaranteed.
+ *
+ * Restores all the PMC registers of the context from set
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+	u16 pmc, left;
+
+	/*
+	 * PMCs only need to be restored when:
+	 *	- context is not masked
+	 *	- monitoring is activated
+	 *
+	 * flags.started keeps its value when monitoring gets
+	 * masked after an overflow
+	 */
+	if (!ctx->flags.started)
+		return;
+
+	/*
+	 * all the pmcs are restored
+	 *
+	 * Restoring only the pmcs in use is not possible because
+	 * some PMU models (e.g. Pentium 4) have dependencies, and
+	 * one application must not run with stale PMCs left by
+	 * another one.
+	 *
+	 * PMU models without dependencies between PMCs could
+	 * restore only the registers in use, but such optimization
+	 * belongs to model-specific code.
+	 */
+	for (pmc = 0, left = ctx->regs.num_pmcs; left; pmc++) {
+		if (!pfm_arch_bv_test_bit(pmc, ctx->regs.pmcs))
+			continue;
+
+		pfm_arch_write_pmc(ctx, pmc, set->pmcs[pmc]);
+		left--;
+	}
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears EFLAGS.IF, i.e., it masks interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible races with other, higher priority interrupts, such as timer
+ * or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+	unsigned long iip;
+	int using_nmi;
+
+	using_nmi = __get_cpu_var(pfm_using_nmi);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	/*
+	 * when using NMI, pfm_handle_nmi() gets called
+	 * first. It stops monitoring and records the
+	 * iip into real_iip, then it reposts the interrupt
+	 * using the lower priority vector LOCAL_PERFMON_VECTOR
+	 *
+	 * On some processors, e.g., P4, it may be that some
+	 * state is already recorded from pfm_handle_nmi()
+	 * and it only needs to be copied back into the normal
+	 * fields so it can be used transparently by higher level
+	 * code.
+	 */
+	if (using_nmi)
+		iip = __get_cpu_var(real_iip);
+	else
+		iip = instruction_pointer(regs);
+
+	pfm_interrupt_handler(iip, regs);
+
+	/*
+	 * On Intel processors:
+	 * 	- it is necessary to clear the MASK field for the LVTPC
+	 * 	  vector. Otherwise interrupts remain masked. See
+	 * 	  section 8.5.1
+	 * AMD X86-64:
+	 * 	- the documentation does not stipulate the behavior but
+	 * 	  it seems to work without the write, so we skip it
+	 */
+	if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+	irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * called from notify_die() notifier from a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, including the perfmon context lock
+ *
+ * Must detect if NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
+ */
+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
+				    unsigned long val,
+				    void *data)
+{
+	struct die_args *args = data;
+	struct pfm_context *ctx;
+	struct pfm_arch_pmu_info *pmu_info;
+
+	/*
+	 * only NMI related calls
+	 */
+	if (val != DIE_NMI_IPI)
+		return NOTIFY_DONE;
+
+	/*
+	 * perfmon not using NMI on this CPU
+	 */
+	if (!__get_cpu_var(pfm_using_nmi))
+		return NOTIFY_DONE;
+
+	/*
+	 * No context on this CPU
+	 */
+	ctx = __get_cpu_var(pmu_ctx);
+	if (!ctx) {
+		PFM_DBG_ovfl("no ctx");
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * Detect if we have overflows, i.e., NMI interrupt
+	 * caused by PMU
+	 */
+	pmu_info = pfm_pmu_info();
+	if (!pmu_info->has_ovfls(ctx)) {
+		PFM_DBG_ovfl("no ovfl");
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * we stop the PMU to avoid further overflows before this
+	 * one is treated by the lower priority interrupt handler
+	 */
+	pmu_info->quiesce();
+
+	/*
+	 * record actual instruction pointer
+	 */
+	__get_cpu_var(real_iip) = instruction_pointer(args->regs);
+
+	/*
+	 * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
+	 */
+	pfm_arch_resend_irq(ctx);
+
+	/*
+	 * we need to rewrite the APIC vector on Intel
+	 */
+	if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+	/*
+	 * the notification was for us
+	 */
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block pfm_nmi_nb = {
+	.notifier_call = pfm_handle_nmi
+};
+
+/**
+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
+ * @ctx: context the interrupt is reposted for (not used by the x86 code)
+ *
+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
+ */
+void pfm_arch_resend_irq(struct pfm_context *ctx)
+{
+	unsigned long val, dest;
+	/*
+	 * we cannot use hw_resend_irq() because it goes to
+	 * the I/O APIC. We need to go to the Local APIC.
+	 *
+	 * The "int vec" is not the right solution either
+	 * because it triggers a software intr. We need
+	 * to regenerate the interrupt and have it pended
+	 * until we unmask interrupts.
+	 *
+	 * Instead we send ourself an IPI on the perfmon vector.
+	 */
+	val  = APIC_DEST_SELF|APIC_INT_ASSERT|
+	       APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
+
+	dest = apic_read(APIC_ID);
+	apic_write(APIC_ICR2, dest);
+	apic_write(APIC_ICR, val);
+}
+
+/**
+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
+ * @data: PFM_X86_FL_* pmu flags, passed by value in the pointer
+ */
+static void pfm_arch_pmu_acquire_percpu(void *data)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+	unsigned int tmp, vec;
+	unsigned long flags = (unsigned long)data;
+	unsigned long lvtpc;
+
+	pmu_info = pfm_pmu_conf->pmu_info;
+	/*
+	 * we only reprogram the LVTPC vector if we have detected
+	 * no sharing, otherwise it means the APIC is already programmed
+	 * and we use whatever vector (likely NMI) is there
+	 */
+	if (!(flags & PFM_X86_FL_SHARING)) {
+		vec = LOCAL_PERFMON_VECTOR;
+
+		tmp = apic_read(APIC_LVTERR);
+		apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
+		apic_write(APIC_LVTPC, vec);
+		apic_write(APIC_LVTERR, tmp);
+	}
+	lvtpc = (unsigned long)apic_read(APIC_LVTPC);
+	/* record whether LVTPC is set for NMI delivery on this CPU */
+	__get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
+
+	PFM_DBG("LVTPC=0x%lx using_nmi=%d",
+			lvtpc, __get_cpu_var(pfm_using_nmi));
+	/*
+	 * invoke model specific acquire routine, if there is one
+	 */
+	if (pmu_info->acquire_pmu_percpu)
+		pmu_info->acquire_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_acquire - acquire PMU resource from system
+ * @unavail_pmcs : bitmask to use to set unavailable pmcs
+ * @unavail_pmds : bitmask to use to set unavailable pmds
+ *
+ * Called with interrupts not masked.
+ * Grab PMU registers from lower level MSR allocator, then program the
+ * APIC on each CPU (either LOCAL_PERFMON_VECTOR or the existing NMI setup).
+ *
+ * Return: 0 on success, -EBUSY if some PMCs are already reserved and
+ * the PMU model does not support sharing
+ */
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+	struct pfm_regmap_desc *d;
+	u16 i, nlost;
+
+	pmu_info = pfm_pmu_conf->pmu_info;
+	pmu_info->flags &= ~PFM_X86_FL_SHARING;
+
+	nlost = 0;
+
+	d = pfm_pmu_conf->pmc_desc;
+	for (i = 0; i < pfm_pmu_conf->num_pmc_entries;  i++, d++) {
+		if (!(d->type & PFM_REG_I))
+			continue;
+
+		/*
+		 * reserve register with lower-level allocator
+		 */
+		if (!reserve_evntsel_nmi(d->hw_addr)) {
+			PFM_DBG("pmc%d(%s) already used", i, d->desc);
+			pfm_arch_bv_set_bit(i, unavail_pmcs);
+			nlost++;
+			continue;
+		}
+	}
+	PFM_DBG("nlost=%d info_flags=0x%x", nlost, pmu_info->flags);
+	/*
+	 * some PMU models (e.g., P6) do not support sharing
+	 * so check if we found less than the expected number of PMC registers
+	 */
+	if (nlost) {
+		if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
+			PFM_INFO("PMU already used by another subsystem, "
+				 "PMU does not support sharing, "
+				 "try disabling Oprofile or "
+				 "reboot with nmi_watchdog=0");
+			goto undo;
+		}
+		pmu_info->flags |= PFM_X86_FL_SHARING;
+	}
+
+	d = pfm_pmu_conf->pmd_desc;
+	for (i = 0; i < pfm_pmu_conf->num_pmd_entries;  i++, d++) {
+		if (!(d->type & PFM_REG_I))
+			continue;
+
+		if (!reserve_perfctr_nmi(d->hw_addr)) {
+			PFM_DBG("pmd%d(%s) already used", i, d->desc);
+			pfm_arch_bv_set_bit(i, unavail_pmds);
+		}
+	}
+	/*
+	 * program APIC on each CPU, flags are passed by value
+	 */
+	on_each_cpu(pfm_arch_pmu_acquire_percpu,
+		    (void *)(unsigned long)pmu_info->flags , 1);
+
+	return 0;
+undo:
+	/*
+	 * must undo reservation of pmcs in case of error, i.e., release
+	 * only the ones which are not marked unavailable
+	 */
+	d = pfm_pmu_conf->pmc_desc;
+	for (i = 0; i < pfm_pmu_conf->num_pmc_entries;  i++, d++) {
+		if (!(d->type & PFM_REG_I))
+			continue;
+		if (!pfm_arch_bv_test_bit(i, unavail_pmcs))
+			release_evntsel_nmi(d->hw_addr);
+	}
+	return -EBUSY;
+}
+
+/**
+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
+ * @data: not used
+ */
+static void pfm_arch_pmu_release_percpu(void *data)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+
+	pmu_info = pfm_pmu_conf->pmu_info;
+
+	__get_cpu_var(pfm_using_nmi) = 0;
+	/*
+	 * invoke model specific release routine, if there is one
+	 */
+	if (pmu_info->release_pmu_percpu)
+		pmu_info->release_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_release - release PMU resource to system
+ *
+ * called from pfm_pmu_release(), interrupts are not masked
+ * On x86, we return the PMU registers to the MSR allocator
+ */
+void pfm_arch_pmu_release(void)
+{
+	struct pfm_regmap_desc *d;
+	u16 i, n;
+
+	d = pfm_pmu_conf->pmc_desc;
+	n = pfm_pmu_conf->regs_all.num_pmcs;
+	for (i = 0; n; i++, d++) {
+		if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs))
+			continue;
+		release_evntsel_nmi(d->hw_addr);
+		n--;
+		PFM_DBG("pmc%u released", i);
+	}
+	d = pfm_pmu_conf->pmd_desc;
+	n = pfm_pmu_conf->regs_all.num_pmds;
+	for (i = 0; n; i++, d++) {
+		if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds))
+			continue;
+		release_perfctr_nmi(d->hw_addr);
+		n--;
+		PFM_DBG("pmd%u released", i);
+	}
+
+	/*
+	 * undo per-CPU state on all CPUs, not only when the local CPU uses
+	 * NMI: pfm_arch_pmu_acquire() sets it up on each CPU unconditionally
+	 */
+	on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1);
+}
+
+/**
+ * pfm_arch_init - one time global arch-specific initialization
+ *
+ * called from pfm_init()
+ * Return: 0 on success, else the error from register_die_notifier()
+ */
+int __init pfm_arch_init(void)
+{
+	/*
+	 * we need to register our NMI handler when the kernels boots
+	 * to avoid a deadlock condition with the NMI watchdog or Oprofile
+	 * if we were to try and register/unregister on-demand.
+	 */
+	return register_die_notifier(&pfm_nmi_nb);
+}
diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c
new file mode 100644
index 000000000000..f078fe28137d
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_amd64.c
@@ -0,0 +1,483 @@
+/*
+ * This file contains the PMU description for the Athlon64 and Opteron64
+ * processors. It supports 32 and 64-bit modes.
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+  */
+#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
+#include <linux/topology.h>
+#include <linux/pci.h>
+#include <linux/perfmon_kern.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+
+static void __kprobes pfm_amd64_quiesce(void);
+static int pfm_amd64_has_ovfls(struct pfm_context *ctx);
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+			       struct pfm_event_set *set);
+
+static u64 enable_mask[PFM_MAX_PMCS];	/* set in pfm_amd64_setup_registers() */
+static u16 max_enable;			/* highest bit set in enable_mask + 1 */
+
+static struct pfm_arch_pmu_info pfm_amd64_pmu_info = {
+	.stop_save = pfm_amd64_stop_save,
+	.has_ovfls = pfm_amd64_has_ovfls,
+	.quiesce = pfm_amd64_quiesce,	/* called from the NMI notifier */
+};
+
+/*
+ * bit 20: force Local APIC interrupt on overflow
+ */
+#define PFM_K8_VAL	(1ULL<<20)
+#define PFM_K8_NO64	(1ULL<<20)
+
+/*
+ * reserved bit masks: a bit set to 1 is reserved
+ *
+ * for family 15:
+ * - upper 32 bits are reserved
+ * - bit 20, bit 21
+ *
+ * for family 16:
+ * - bits 36-39 are reserved
+ * - bits 42-63 are reserved
+ * - bit 20, bit 21
+ *
+ */
+#define PFM_K8_RSVD 	((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
+#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21))
+
+static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = {
+/* pmc0  */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0),
+/* pmc1  */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1),
+/* pmc2  */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2),
+/* pmc3  */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3),
+};
+#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc)
+
+/*
+ * AMD64 counters are 48 bits wide, bits 48-63 are reserved
+ */
+#define PFM_AMD64_CTR_RSVD	(~((1ULL<<48)-1))
+
+#define PFM_AMD_D(n) \
+	{ .type = PFM_REG_C,			\
+	  .desc = "PERFCTR"#n,			\
+	  .hw_addr = MSR_K7_PERFCTR0+n,		\
+	  .rsvd_msk = PFM_AMD64_CTR_RSVD,	\
+	  .dep_pmcs[0] = 1ULL << n		\
+	}
+
+static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = {
+/* pmd0  */ PFM_AMD_D(0),
+/* pmd1  */ PFM_AMD_D(1),
+/* pmd2  */ PFM_AMD_D(2),
+/* pmd3  */ PFM_AMD_D(3)
+};
+#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc)
+
+static struct pfm_context *pfm_nb_task_owner;	/* NB events owner, or NULL */
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf;
+
+/**
+ * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events
+ * @ctx: context to use
+ *
+ * There can only be one user per socket for the Northbridge (NB) events,
+ * so we enforce mutual exclusion as follows:
+ * 	- per-thread : only one context machine-wide can use NB events
+ *
+ * Exclusion is enforced at:
+ * 	- pfm_load_context()
+ * 	- pfm_write_pmcs() for attached contexts
+ *
+ * Exclusion is released at:
+ * 	- pfm_unload_context() or any calls that implicitly uses it
+ *
+ * return:
+ * 	0  : successfully acquire NB access
+ * 	< 0:  errno, failed to acquire NB access
+ */
+static int pfm_amd64_acquire_nb(struct pfm_context *ctx)
+{
+	struct pfm_context **entry, *old;
+
+	/*
+	 * ownership is tracked machine-wide with a single
+	 * slot, so the socket id of the current CPU is not
+	 * needed here
+	 */
+	entry = &pfm_nb_task_owner;
+
+	/* atomically claim the slot, only if it is currently free */
+	old = cmpxchg(entry, NULL, ctx);
+	if (!old) {
+		PFM_DBG("acquired Northbridge event access globally");
+	} else if (old != ctx) {
+		PFM_DBG("global NorthBridge event conflict");
+		return -EBUSY;
+	}
+	/* the slot was free, or it was already owned by ctx */
+	return 0;
+}
+
+/**
+ * pfm_amd64_pmc_write_check -- check validity of pmc writes
+ * @ctx: context to use
+ * @set: event set to use
+ * @req: user request to modify the pmc
+ *
+ * invoked from pfm_write_pmcs() once installed as the pmc_write_check
+ * callback by pfm_amd64_setup_nb_event_ctrl().
+ * context is locked, interrupts are masked
+ * return: 0 when the write is accepted, -EBUSY on NB event conflict
+ */
+static int pfm_amd64_pmc_write_check(struct pfm_context *ctx,
+			     struct pfm_event_set *set,
+			     struct pfarg_pmr *req)
+{
+	unsigned int evt_code;
+
+	/*
+	 * NB events are checked when the context gets loaded
+	 */
+	if (ctx->state == PFM_CTX_UNLOADED)
+		return 0;
+
+	/*
+	 * only NB events, i.e., codes 0xee and above, need NB access
+	 */
+	evt_code = (unsigned int)(req->reg_value & 0xff);
+	if (evt_code >= 0xee)
+		return pfm_amd64_acquire_nb(ctx);
+
+	return 0;
+}
+
+/**
+ * pfm_amd64_load_context - amd64 model-specific load callback
+ * @ctx: context being loaded
+ *
+ * invoked on pfm_load_context(), context locked, interrupts masked.
+ * return: 0, or -EBUSY when NB events are owned by another context
+ */
+static int pfm_amd64_load_context(struct pfm_context *ctx)
+{
+	struct pfm_event_set *set = ctx->active_set;
+	unsigned int pmc, left;
+
+	/*
+	 * scan the used PMCs of the active set, looking for
+	 * an NB event, i.e., an event code of 0xee or above
+	 */
+	for (pmc = 0, left = set->nused_pmcs; left; pmc++) {
+		if (!pfm_arch_bv_test_bit(pmc, set->used_pmcs))
+			continue;
+		/* one NB event is enough to require NB access */
+		if ((set->pmcs[pmc] & 0xff) >= 0xee)
+			return pfm_amd64_acquire_nb(ctx);
+		left--;
+	}
+	return 0;
+}
+
+/**
+ * pfm_amd64_unload_context -- amd64 model-specific unload callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_unload_context()
+ */
+static void pfm_amd64_unload_context(struct pfm_context *ctx)
+{
+	struct pfm_context **entry, *old;
+
+	/*
+	 * ownership is tracked machine-wide with a single slot,
+	 * so the socket id of the current CPU is not needed
+	 */
+	entry = &pfm_nb_task_owner;
+
+	/*
+	 * release the slot only if it is owned by this context
+	 */
+	old = cmpxchg(entry, ctx, NULL);
+	if (old == ctx)
+		PFM_DBG("released NorthBridge events globally");
+}
+
+/**
+ * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls
+ *
+ * detect if we need to activate NorthBridge event access control
+ * return: 0 on success, -ENOMEM if the highest socket id exceeds 255
+ */
+static int pfm_amd64_setup_nb_event_ctrl(void)
+{
+	unsigned int c, n = 0;
+	unsigned int max_phys = 0;
+
+	/* find the highest socket id, it stays 0 without CONFIG_SMP */
+#ifdef CONFIG_SMP
+	for_each_possible_cpu(c) {
+		if (cpu_data(c).phys_proc_id > max_phys)
+			max_phys = cpu_data(c).phys_proc_id;
+	}
+#endif
+	if (max_phys > 255) {
+		PFM_INFO("socket id %u is too big to handle", max_phys);
+		return -ENOMEM;
+	}
+
+	/* nothing to set up unless the highest socket id is at least 1 */
+	n = max_phys + 1;
+	if (n < 2)
+		return 0;
+
+	pfm_nb_task_owner = NULL;
+
+	/*
+	 * activate write-checker for PMC registers
+	 */
+	for (c = 0; c < PFM_AMD_NUM_PMCS; c++)
+		pfm_amd64_pmc_desc[c].type |= PFM_REG_WC;
+
+	pfm_amd64_pmu_info.load_context = pfm_amd64_load_context;
+	pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context;
+
+	pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check;
+
+	PFM_INFO("NorthBridge event access control enabled");
+	return 0;
+}
+
+/**
+ * pfm_amd64_setup_registers -- initialize register table
+ *
+ * modify register table based on actual host CPU
+ */
+static void pfm_amd64_setup_registers(void)
+{
+	u16 i;
+
+	/* mark pmc0 to pmc3 in enable_mask */
+	for (i = 0; i < 4; i++) {
+		pfm_arch_bv_set_bit(i, enable_mask);
+	}
+	max_enable = 4;
+
+	/* reserved bit fields only need adjusting for family 16 */
+	if (current_cpu_data.x86 != 16)
+		return;
+
+	for (i = 0; i < PFM_AMD_NUM_PMCS; i++) {
+		if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD)
+			pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD;
+	}
+}
+
+/**
+ * pfm_amd64_probe_pmu -- detect host PMU (0 if supported, -1 otherwise)
+ */
+static int pfm_amd64_probe_pmu(void)
+{
+	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return -1;
+
+	switch (current_cpu_data.x86) {
+	case  6:
+	case 15:
+	case 16:
+		PFM_INFO("found family=%d", current_cpu_data.x86);
+		break;
+	default:
+		PFM_INFO("unsupported family=%d", current_cpu_data.x86);
+		return -1;
+	}
+
+	/*
+	 * check for local APIC (required)
+	 */
+	if (!cpu_has_apic) {
+		PFM_INFO("no local APIC, unsupported");
+		return -1;
+	}
+
+	/* NorthBridge access control is only set up on multi-core parts */
+	if (current_cpu_data.x86_max_cores > 1
+	    && pfm_amd64_setup_nb_event_ctrl())
+		return -1;
+
+	pfm_amd64_setup_registers();
+
+	return 0;
+}
+
+/**
+ * pfm_amd64_has_ovfls -- detect if pending overflows
+ * @ctx: context to use
+ *
+ * detect if counters have overflowed.
+ * return:
+ *	0 : no overflow
+ *	1 : at least one overflow
+ */
+static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx)
+{
+	struct pfm_regmap_desc *xrd;
+	u64 *cnt_mask;
+	u64 wmask, val;
+	u16 i, num;
+
+	/*
+	 * Check regular counters: overflowed when bit counter_width is clear
+	 */
+	cnt_mask = ctx->regs.cnt_pmds;
+	num = ctx->regs.num_counters;
+	wmask = 1ULL << pfm_pmu_conf->counter_width;
+	xrd = pfm_amd64_pmd_desc;
+
+	for (i = 0; num; i++) { /* stop once num counters were visited */
+		if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+			rdmsrl(xrd[i].hw_addr, val);
+			if (!(val & wmask))
+				return 1;
+			num--;
+		}
+	}
+	return 0;
+}
+
+/**
+ * pfm_amd64_stop_save - stop monitoring, collect pending overflows
+ * @ctx: context to use
+ * @set: event set to stop
+ *
+ * interrupts are masked, PMU access guaranteed
+ */
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+			       struct pfm_event_set *set)
+{
+	struct pfm_arch_pmu_info *pmu_info;
+	u64 used_mask[PFM_PMC_BV];
+	u64 *cnt_pmds;
+	u64 val, wmask, ovfl_mask;
+	u32 i, count;
+
+	pmu_info = pfm_pmu_info(); /* NOTE(review): never read afterwards */
+
+	wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+	pfm_arch_bv_and(used_mask,
+		        set->used_pmcs,
+		        enable_mask,
+		        max_enable);
+
+	count = pfm_arch_bv_weight(used_mask, max_enable);
+
+	/*
+	 * stop monitoring
+	 * Unfortunately, this is very expensive!
+	 * wrmsrl() is serializing.
+	 */
+	for (i = 0; count; i++) {
+		if (pfm_arch_bv_test_bit(i, used_mask)) {
+			wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+			count--;
+		}
+	}
+
+	/*
+	 * if we already have a pending overflow condition, we simply
+	 * return to take care of this first.
+	 */
+	if (set->npend_ovfls)
+		return 1; /* PMDs are not read or saved on this path */
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	cnt_pmds = ctx->regs.cnt_pmds;
+
+	/*
+	 * check for pending overflows and save PMDs (combo)
+	 * we employ used_pmds because we also need to save
+	 * and not just check for pending interrupts.
+	 */
+	count = set->nused_pmds;
+	for (i = 0; count; i++) {
+		if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+			val = pfm_arch_read_pmd(ctx, i);
+			if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) {
+				if (!(val & wmask)) {
+					pfm_arch_bv_set_bit(i,set->povfl_pmds);
+					set->npend_ovfls++;
+				}
+				/* keep saved upper bits, take HW lower bits */
+				val = (set->pmds[i] & ~ovfl_mask)
+				    | (val & ovfl_mask);
+			}
+			set->pmds[i] = val;
+			count--;
+		}
+	}
+	/* 0 means: no need to save PMDs at upper level */
+	return 0;
+}
+
+/**
+ * pfm_amd64_quiesce -- stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_amd64_quiesce(void)
+{
+	/*
+	 * quiesce PMU by clearing available registers that have
+	 * the start/stop capability (EVNTSEL0-3, when present in regs_all)
+	 */
+	if (pfm_arch_bv_test_bit(0, pfm_pmu_conf->regs_all.pmcs))
+		wrmsrl(MSR_K7_EVNTSEL0, 0);
+	if (pfm_arch_bv_test_bit(1, pfm_pmu_conf->regs_all.pmcs))
+		wrmsrl(MSR_K7_EVNTSEL0+1, 0);
+	if (pfm_arch_bv_test_bit(2, pfm_pmu_conf->regs_all.pmcs))
+		wrmsrl(MSR_K7_EVNTSEL0+2, 0);
+	if (pfm_arch_bv_test_bit(3, pfm_pmu_conf->regs_all.pmcs))
+		wrmsrl(MSR_K7_EVNTSEL0+3, 0);
+}
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf = {
+	.pmu_name = "AMD64",
+	.counter_width = 47, /* overflow checks test bit (1ULL << 47) */
+	.pmd_desc = pfm_amd64_pmd_desc,
+	.pmc_desc = pfm_amd64_pmc_desc,
+	.num_pmc_entries = PFM_AMD_NUM_PMCS,
+	.num_pmd_entries = PFM_AMD_NUM_PMDS,
+	.version = "1.2",
+	.pmu_info = &pfm_amd64_pmu_info /* AMD64-specific callbacks */
+};
+
+static int __init pfm_amd64_pmu_init_module(void)
+{
+	if (pfm_amd64_probe_pmu()) /* non-zero: unsupported CPU or no APIC */
+		return -ENOSYS;
+	return pfm_pmu_register(&pfm_amd64_pmu_conf);
+}
+
+device_initcall(pfm_amd64_pmu_init_module);
diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c
new file mode 100644
index 000000000000..ce4293dcfcda
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_intel_arch.c
@@ -0,0 +1,628 @@
+/*
+ * This file contains the Intel architectural perfmon v1, v2, v3
+ * description tables.
+ *
+ * Architectural perfmon was introduced with Intel Core Solo/Duo
+ * processors.
+ *
+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/perfmon_kern.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+static u64 enable_mask[PFM_MAX_PMCS]; /* bitvector of PMCs with start/stop */
+static u16 max_enable; /* highest bit set in enable_mask, plus one */
+static int pfm_intel_arch_version; /* CPUID.0xa version, set by probe */
+
+DEFINE_PER_CPU(u64, saved_global_ctrl); /* GLOBAL_CTRL value at acquire */
+
+/*
+ * - upper 32 bits are reserved
+ * - INT: APIC enable bit is reserved (forced to 1)
+ * - bit 21 is reserved
+ *
+ * RSVD: reserved bits are 1
+ */
+#define PFM_IA_PMC_RSVD	((~((1ULL<<32)-1)) \
+			| (1ULL<<20) \
+			| (1ULL<<21))
+
+/*
+ * force Local APIC interrupt on overflow
+ * disable with NO_EMUL64
+ */
+#define PFM_IA_PMC_VAL	(1ULL<<20)
+#define PFM_IA_NO64	(1ULL<<20)
+
+/*
+ * architecture specifies that:
+ * IA32_PMCx MSR        : starts at 0x0c1 & occupy a contiguous block of MSR
+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR
+ * MSR_GEN_FIXED_CTR0   : starts at 0x309 & occupy a contiguous block of MSR
+ */
+#define MSR_GEN_SEL_BASE	MSR_P6_EVNTSEL0
+#define MSR_GEN_PMC_BASE	MSR_P6_PERFCTR0
+#define MSR_GEN_FIXED_PMC_BASE	MSR_CORE_PERF_FIXED_CTR0
+
+/*
+ * layout of EAX for CPUID.0xa leaf function (fields listed from bit 0 up)
+ */
+struct pmu_eax {
+	unsigned int version:8;		/* architectural perfmon version */
+	unsigned int num_cnt:8; 	/* number of generic counters */
+	unsigned int cnt_width:8;	/* width of generic counters */
+	unsigned int ebx_length:8;	/* length of EBX event bit vector */
+};
+
+/*
+ * layout of EDX for CPUID.0xa leaf function, used for perfmon v2 and later
+ */
+struct pmu_edx {
+	unsigned int num_cnt:5;		/* number of fixed counters */
+	unsigned int cnt_width:8;	/* width of fixed counters */
+	unsigned int reserved:19;
+};
+
+static void pfm_intel_arch_acquire_pmu_percpu(void);
+static void pfm_intel_arch_release_pmu_percpu(void);
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+				    struct pfm_event_set *set);
+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
+static void __kprobes pfm_intel_arch_quiesce(void);
+
+/*
+ * PMU-specific callbacks, referenced by pfm_intel_arch_pmu_conf.pmu_info
+ */
+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
+	.stop_save = pfm_intel_arch_stop_save,
+	.has_ovfls = pfm_intel_arch_has_ovfls,
+	.quiesce = pfm_intel_arch_quiesce,
+	.acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
+	.release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
+};
+
+#define PFM_IA_C(n) {                   \
+	.type = PFM_REG_I64,            \
+	.desc = "PERFEVTSEL"#n,         \
+	.dfl_val = PFM_IA_PMC_VAL,      \
+	.rsvd_msk = PFM_IA_PMC_RSVD,    \
+	.no_emul64_msk = PFM_IA_NO64,   \
+	.hw_addr = MSR_GEN_SEL_BASE+(n) \
+	}
+
+#define PFM_IA_D(n) /* generic counter n, depends on PERFEVTSELn */ \
+	{ .type = PFM_REG_C,			\
+	  .desc = "PMC"#n,			\
+	  .hw_addr = MSR_GEN_PMC_BASE+(n),	\
+	  .dep_pmcs[0] = 1ULL << (n)		\
+	}
+
+#define PFM_IA_FD(n) /* fixed counter n, depends on FIXED_CTRL (pmc16) */ \
+	{ .type = PFM_REG_C,			\
+	  .desc = "FIXED_CTR"#n,		\
+	  .hw_addr = MSR_GEN_FIXED_PMC_BASE+(n),\
+	  .dep_pmcs[0] = 1ULL << 16		\
+	}
+
+
+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
+/* pmc0  */ PFM_IA_C(0),  PFM_IA_C(1),   PFM_IA_C(2),  PFM_IA_C(3),
+/* pmc4  */ PFM_IA_C(4),  PFM_IA_C(5),   PFM_IA_C(6),  PFM_IA_C(7),
+/* pmc8  */ PFM_IA_C(8),  PFM_IA_C(9),  PFM_IA_C(10), PFM_IA_C(11),
+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
+
+/* pmc16 */ { .type = PFM_REG_I,
+	      .desc = "FIXED_CTRL",
+	      .dfl_val = 0x8888888888888888ULL, /* force PMI: bit 3 per nibble */
+	      .rsvd_msk = 0, /* set dynamically by pfm_intel_arch_setup_fixed */
+	      .no_emul64_msk = 0,
+	      .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
+	    },
+};
+#define PFM_IA_MAX_PMCS	ARRAY_SIZE(pfm_intel_arch_pmc_desc)
+
+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
+/* pmd0  */  PFM_IA_D(0),  PFM_IA_D(1),  PFM_IA_D(2),  PFM_IA_D(3),
+/* pmd4  */  PFM_IA_D(4),  PFM_IA_D(5),  PFM_IA_D(6),  PFM_IA_D(7),
+/* pmd8  */  PFM_IA_D(8),  PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
+
+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
+/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15)
+};
+#define PFM_IA_MAX_PMDS	ARRAY_SIZE(pfm_intel_arch_pmd_desc)
+
+#define PFM_IA_MAX_CNT		16 /* # generic counters in mapping table */
+#define PFM_IA_MAX_FCNT		16 /* # of fixed counters in mapping table */
+#define PFM_IA_FCNT_BASE	16 /* base index of fixed counters PMD */
+
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
+
+static void pfm_intel_arch_check_errata(void)
+{
+	/*
+	 * Core Duo (family 6, model 14) erratum AE49, no fix: both counters
+	 * share a single enable bit in PERFEVTSEL0, so set NO_SHARING
+	 */
+	if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14)
+		pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
+}
+
+static inline void set_enable_mask(unsigned int i)
+{
+	const unsigned int limit = i + 1;
+
+	pfm_arch_bv_set_bit(i, enable_mask);
+	if (max_enable < limit) /* keep max_enable at highest index + 1 */
+		max_enable = limit;
+}
+
+static void pfm_intel_arch_setup_generic(unsigned int version,
+					 unsigned int width,
+					 unsigned int count)
+{
+	u64 rsvd;
+	unsigned int i;
+
+	/*
+	 * first we handle the generic counters:
+	 *
+	 * - ensure HW does not have more registers than hardcoded in the tables
+	 * - adjust rsvd_msk to actual counter width
+	 * - initialize enable_mask (list of PMC with start/stop capability)
+	 * - mark unused hardcoded generic counters as unimplemented
+	 */
+
+	/*
+	 * min of number of HW counters and hardcoded in the tables
+	 */
+	if (count > PFM_IA_MAX_CNT) {
+		printk(KERN_INFO "perfmon: Limiting number of generic counters"
+				 " to %u, HW supports %u\n",
+				 PFM_IA_MAX_CNT, count);
+		count = PFM_IA_MAX_CNT;
+	}
+
+	/*
+	 * adjust rsvd_msk for generic counters based on actual width
+	 * initialize enable_mask (1 per generic counter)
+	 */
+	rsvd = ~((1ULL << width)-1);
+	for (i = 0; i < count; i++) {
+		pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
+		set_enable_mask(i);
+	}
+
+	/*
+	 * handle version 3 new anythread bit (21): no longer reserved
+	 */
+	if (version == 3) {
+		for (i = 0; i < count; i++)
+			pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
+	}
+
+
+	/*
+	 * mark unused generic counters as not available
+	 */
+	for (i = count ; i < PFM_IA_MAX_CNT; i++) {
+		pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
+		pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
+	}
+}
+
+static void pfm_intel_arch_setup_fixed(unsigned int version,
+				       unsigned int width,
+				       unsigned int count)
+{
+	u64 rsvd, dfl;
+	unsigned int i;
+
+	/*
+	 * handle the fixed counters (if any):
+	 *
+	 * - ensure HW does not have more registers than hardcoded in the tables
+	 * - adjust rsvd_msk to actual counter width
+	 * - initialize enable_mask (list of PMC with start/stop capability)
+	 * - mark unused hardcoded fixed counters as unimplemented
+	 */
+	if (count > PFM_IA_MAX_FCNT) {
+		printk(KERN_INFO "perfmon: Limiting number of fixed counters"
+				 " to %u, HW supports %u\n",
+				 PFM_IA_MAX_FCNT, count);
+		count = PFM_IA_MAX_FCNT;
+	}
+	/*
+	 * adjust rsvd_msk for fixed counters based on actual width
+	 */
+	rsvd = ~((1ULL << width)-1);
+	for (i = 0; i < count; i++)
+		pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
+
+	/*
+	 * handle version 3 new anythread bit (bit 2 of each 4-bit field)
+	 */
+	if (version == 3)
+		rsvd = 1ULL << 3;
+	else
+		rsvd = 3ULL << 2;
+
+	pfm_intel_arch_pmc_desc[16].rsvd_msk = 0;
+	for (i = 0; i < count; i++)
+		pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2);
+
+	/*
+	 * mark unused fixed counters as unimplemented
+	 *
+	 * update the rsvd_msk, dfl_val in FIXED_CTRL:
+	 *	- rsvd_msk: set all 4 bits
+	 *	- dfl_val : clear all 4 bits
+	 */
+	dfl = pfm_intel_arch_pmc_desc[16].dfl_val;
+	rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk;
+
+	for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
+		pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
+		rsvd |= 0xfULL << (i<<2);
+		dfl &= ~(0xfULL << (i<<2));
+	}
+
+	/*
+	 * FIXED_CTR_CTRL unavailable when no fixed counters are defined
+	 */
+	if (!count) {
+		pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA;
+	} else {
+		/* update rsvd_mask and dfl_val */
+		pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd;
+		pfm_intel_arch_pmc_desc[16].dfl_val = dfl;
+		set_enable_mask(16);
+	}
+}
+
+static int pfm_intel_arch_probe_pmu(void)
+{
+	union {
+		unsigned int val;
+		struct pmu_eax eax;
+		struct pmu_edx edx;
+	} eax, edx;
+	unsigned int ebx, ecx, width = 0;
+
+	edx.val = 0;
+
+	if (!cpu_has_arch_perfmon) {
+		PFM_INFO("no support for Intel architectural PMU");
+		return -1;
+	}
+
+	if (!cpu_has_apic) {
+		PFM_INFO("no Local APIC, try rebooting with lapic option");
+		return -1;
+	}
+
+	/* cpuid() call protected by cpu_has_arch_perfmon */
+	cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
+
+	/*
+	 * some 6/15 models have buggy BIOS
+	 */
+	if (eax.eax.version == 0
+	    && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
+		PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
+		eax.eax.version = 2;
+		eax.eax.num_cnt = 2;
+		eax.eax.cnt_width = 40;
+	}
+
+	/*
+	 * some v2 BIOSes are incomplete
+	 */
+	if (eax.eax.version == 2 && !edx.edx.num_cnt) {
+		PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
+		edx.edx.num_cnt = 3;
+		edx.edx.cnt_width = 40;
+	}
+
+	/*
+	 * no fixed counters on earlier versions: only the generic width applies
+	 */
+	if (eax.eax.version < 2) {
+		edx.val = 0;
+		width = eax.eax.cnt_width;
+	} else {
+		/*
+		 * use the min value of both widths until we support
+		 * variable width counters
+		 */
+		width = eax.eax.cnt_width < edx.edx.cnt_width ?
+			eax.eax.cnt_width : edx.edx.cnt_width;
+	}
+
+	/*
+	 * Intel Atom processors have a buggy firmware which does not report
+	 * the correct number of fixed counters
+	 */
+	if (eax.eax.version == 3 && edx.edx.num_cnt < 3
+	    && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
+		PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
+		edx.edx.num_cnt = 3;
+	}
+
+	PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
+	PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
+		  eax.eax.num_cnt,
+		  eax.eax.cnt_width,
+		  edx.edx.num_cnt,
+		  edx.edx.cnt_width);
+
+	pfm_intel_arch_setup_generic(eax.eax.version,
+				     width,
+				     eax.eax.num_cnt);
+
+	pfm_intel_arch_setup_fixed(eax.eax.version,
+				   width,
+				   edx.edx.num_cnt);
+
+	pfm_intel_arch_check_errata();
+
+	pfm_intel_arch_version = eax.eax.version;
+
+	return 0; /* -1 above means: no architectural PMU or no local APIC */
+}
+
+/**
+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
+ * @ctx: context to work on
+ *
+ * detect if counters have overflowed.
+ * return:
+ *	0 : no overflow
+ *	1 : at least one overflow
+ */
+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
+{
+	u64 *cnt_mask;
+	u64 wmask, val;
+	u16 i, num;
+
+	cnt_mask = ctx->regs.cnt_pmds;
+	num = ctx->regs.num_counters;
+	wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+	/*
+	 * MSR addresses come from the local pfm_intel_arch_pmd_desc
+	 * table. A counter has overflowed when bit counter_width of
+	 * its value is clear.
+	 *
+	 * We need to check cnt_mask because not all registers
+	 * may be available.
+	 */
+	for (i = 0; num; i++) {
+		if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+			rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val);
+			if (!(val & wmask))
+				return 1;
+			num--;
+		}
+	}
+	return 0;
+}
+
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+				    struct pfm_event_set *set)
+{
+	u64 used_mask[PFM_PMC_BV];
+	u64 val, wmask, ovfl_mask;
+	u32 i, count;
+
+	wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+	/* used_mask = PMCs used by the set that have start/stop capability */
+	pfm_arch_bv_and(used_mask,
+			set->used_pmcs,
+			enable_mask,
+			max_enable);
+
+	count = pfm_arch_bv_weight(used_mask, max_enable);
+
+	/*
+	 * stop monitoring
+	 * Unfortunately, this is very expensive!
+	 * wrmsrl() is serializing.
+	 */
+	for (i = 0; count; i++) {
+		if (pfm_arch_bv_test_bit(i, used_mask)) {
+			wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+			count--;
+		}
+	}
+
+	/*
+	 * if we already have a pending overflow condition, we simply
+	 * return to take care of this first.
+	 */
+	if (set->npend_ovfls)
+		return 1; /* PMDs are not read or saved on this path */
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+	/*
+	 * check for pending overflows and save PMDs (combo)
+	 * we employ used_pmds because we also need to save
+	 * and not just check for pending interrupts.
+	 *
+	 * all pmds are counters
+	 */
+	count = set->nused_pmds;
+	for (i = 0; count; i++) {
+		if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+			val = pfm_arch_read_pmd(ctx, i);
+			if (!(val & wmask)) {
+				pfm_arch_bv_set_bit(i, set->povfl_pmds);
+				set->npend_ovfls++;
+			}
+			val = (set->pmds[i] & ~ovfl_mask)
+				| (val & ovfl_mask);
+			set->pmds[i] = val;
+			count--;
+		}
+	}
+	/* 0 means: no need to save PMDs at upper level */
+	return 0;
+}
+
+/**
+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_intel_arch_quiesce(void)
+{
+	u16 i;
+
+	/*
+	 * zero every implemented PMC. PMC16 is the fixed control register
+	 * so it has a distinct MSR address; others are MSR_P6_EVNTSEL0+i
+	 *
+	 * We do not use the hw_addr field in the table to avoid touching
+	 * too many cachelines
+	 */
+	for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
+		if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) {
+			if (i == 16)
+				wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+			else
+				wrmsrl(MSR_P6_EVNTSEL0+i, 0);
+		}
+	}
+}
+/**
+ * pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU
+ *
+ * Since v2, there exists global control MSR, to start/stop and
+ * also collect overflow status information. In particular,
+ * GLOBAL_CTRL controls start/stop and has one bit per counter.
+ * To maintain backward compatibility with v1, the power-on value
+ * of GLOBAL_CTRL should be such that generic counters are enabled
+ * but fixed counters are disabled (true on Penryn and Atom currently).
+ *
+ * Here, we simply make sure that all available counters are enabled.
+ * After that, start/stop is controlled on a per-counter basis.
+ */
+static void pfm_intel_arch_acquire_pmu_percpu(void)
+{
+	struct pfm_regmap_desc *d;
+	u64 mask = 0;
+	unsigned int i;
+
+	/* nothing to do for v1 */
+	if (pfm_intel_arch_version < 2)
+		return;
+
+	/*
+	 * build bitmask of registers that are available to
+	 * us. In some cases, there may be fewer registers than
+	 * what the PMU supports due to sharing with other kernel
+	 * subsystems, such as NMI
+	 */
+	d = pfm_pmu_conf->pmd_desc;
+	for (i=0; i < 16; i++) { /* generic counter i -> mask bit i */
+		if ((d[i].type & PFM_REG_I) == 0)
+			continue;
+		mask |= 1ull << i;
+	}
+	for (i=16; i < PFM_IA_MAX_PMDS; i++) { /* fixed -> bit 32+(i-16) */
+		if ((d[i].type & PFM_REG_I) == 0)
+			continue;
+		mask |= 1ull << (32+i-16);
+	}
+	/*
+	 * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
+	 */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+
+	PFM_DBG("global=0x%llx set to 0x%llx",
+		__get_cpu_var(saved_global_ctrl),
+		mask);
+	/*
+	 * enable all registers
+	 *
+	 * No need to quiesce PMU. If there is an overflow, it will be
+	 * treated as spurious by the handler
+	 */
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
+}
+
+/**
+ * pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU
+ *
+ * Since v2, there exists global control MSR, to start/stop and
+ * also collect overflow status information. In particular,
+ * GLOBAL_CTRL controls start/stop and has one bit per counter.
+ * To maintain backward compatibility with v1, the power-on value
+ * of GLOBAL_CTRL should be such that generic counters are enabled
+ * but fixed counters are disabled (true on Penryn and Atom currently).
+ *
+ * Here, we are done using the PMU, so we restore the value saved at acquire.
+ */
+static void pfm_intel_arch_release_pmu_percpu(void)
+{
+	/* nothing to do for v1 */
+	if (pfm_intel_arch_version < 2)
+		return;
+
+	PFM_DBG("global_ctrl restored to 0x%llx",
+		__get_cpu_var(saved_global_ctrl));
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+}
+
+/*
+ * Counters may have model-specific width. Yet the documentation says
+ * that only the lower 32 bits can be written to due to the specification
+ * of wrmsr. Bits 32..(w-1) are sign extensions of bit 31. Bits w..63 must
+ * not be set (see rsvd_msk for PMDs). As such the effective width of a
+ * counter is 31 bits only regardless of what CPUID.0xa returns.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
+ */
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
+	.pmu_name = "Intel architectural",
+	.pmd_desc = pfm_intel_arch_pmd_desc,
+	.counter_width   = 31,
+	.num_pmc_entries = PFM_IA_MAX_PMCS,
+	.num_pmd_entries = PFM_IA_MAX_PMDS,
+	.pmc_desc = pfm_intel_arch_pmc_desc,
+	.version = "1.0",
+	.pmu_info = &pfm_intel_arch_pmu_info
+};
+
+static int __init pfm_intel_arch_pmu_init_module(void)
+{
+	if (pfm_intel_arch_probe_pmu()) /* non-zero: no arch PMU or no APIC */
+		return -ENOSYS;
+
+	/* tables were adjusted by the probe; hand them to the perfmon core */
+	return pfm_pmu_register(&pfm_intel_arch_pmu_conf);
+}
+
+device_initcall(pfm_intel_arch_pmu_init_module);