author    Stephen Rothwell <sfr@canb.auug.org.au>  2008-11-11 18:00:33 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>  2008-11-11 18:00:33 +1100
commit    6bfea1858872c1e57d94d686e3144bfa10ca48cb (patch)
tree      cfe7ad66c1f9a14f9d419c3ebf3100264b0044d4
parent    bc6435afdc6a1e0c5236a4a031f372bc1c62341d (diff)
parent    4872c7055867a9b583c76cd7744030dd515a5f35 (diff)
Merge commit 'perfmon3/master'
-rw-r--r-- Documentation/ABI/testing/sysfs-perfmon | 42
-rw-r--r-- Documentation/ABI/testing/sysfs-perfmon-pmu | 48
-rw-r--r-- Documentation/perfmon.txt | 206
-rw-r--r-- Makefile | 2
-rw-r--r-- arch/ia64/Kconfig | 2
-rw-r--r-- arch/ia64/configs/bigsur_defconfig | 2
-rw-r--r-- arch/ia64/configs/generic_defconfig | 2
-rw-r--r-- arch/ia64/configs/gensparse_defconfig | 2
-rw-r--r-- arch/ia64/configs/sim_defconfig | 2
-rw-r--r-- arch/ia64/configs/tiger_defconfig | 2
-rw-r--r-- arch/ia64/configs/zx1_defconfig | 2
-rw-r--r-- arch/ia64/include/asm/processor.h | 2
-rw-r--r-- arch/ia64/include/asm/system.h | 2
-rw-r--r-- arch/ia64/kernel/Makefile | 2
-rw-r--r-- arch/ia64/kernel/irq_ia64.c | 4
-rw-r--r-- arch/ia64/kernel/perfmon.c | 6
-rw-r--r-- arch/ia64/kernel/process.c | 16
-rw-r--r-- arch/ia64/kernel/ptrace.c | 4
-rw-r--r-- arch/ia64/kernel/smpboot.c | 4
-rw-r--r-- arch/ia64/lib/Makefile | 2
-rw-r--r-- arch/ia64/oprofile/Makefile | 2
-rw-r--r-- arch/ia64/oprofile/init.c | 4
-rw-r--r-- arch/x86/Kconfig | 2
-rw-r--r-- arch/x86/Makefile | 3
-rw-r--r-- arch/x86/ia32/ia32entry.S | 5
-rw-r--r-- arch/x86/include/asm/Kbuild | 1
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r-- arch/x86/include/asm/mach-default/entry_arch.h | 4
-rw-r--r-- arch/x86/include/asm/perfmon.h | 34
-rw-r--r-- arch/x86/include/asm/perfmon_kern.h | 438
-rw-r--r-- arch/x86/include/asm/thread_info.h | 8
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 5
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 11
-rw-r--r-- arch/x86/kernel/entry_32.S | 2
-rw-r--r-- arch/x86/kernel/entry_64.S | 8
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 5
-rw-r--r-- arch/x86/kernel/process_32.c | 10
-rw-r--r-- arch/x86/kernel/process_64.c | 10
-rw-r--r-- arch/x86/kernel/signal_32.c | 5
-rw-r--r-- arch/x86/kernel/signal_64.c | 5
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 5
-rw-r--r-- arch/x86/oprofile/nmi_int.c | 10
-rw-r--r-- arch/x86/perfmon/Kconfig | 33
-rw-r--r-- arch/x86/perfmon/Makefile | 7
-rw-r--r-- arch/x86/perfmon/perfmon.c | 619
-rw-r--r-- arch/x86/perfmon/perfmon_amd64.c | 483
-rw-r--r-- arch/x86/perfmon/perfmon_intel_arch.c | 628
-rw-r--r-- include/linux/perfmon.h | 102
-rw-r--r-- include/linux/perfmon_kern.h | 285
-rw-r--r-- include/linux/perfmon_pmu.h | 138
-rw-r--r-- include/linux/sched.h | 4
-rw-r--r-- include/linux/syscalls.h | 11
-rw-r--r-- kernel/sys_ni.c | 7
-rw-r--r-- perfmon/Makefile | 10
-rw-r--r-- perfmon/perfmon_activate.c | 136
-rw-r--r-- perfmon/perfmon_attach.c | 337
-rw-r--r-- perfmon/perfmon_ctx.c | 400
-rw-r--r-- perfmon/perfmon_ctxsw.c | 252
-rw-r--r-- perfmon/perfmon_file.c | 306
-rw-r--r-- perfmon/perfmon_init.c | 87
-rw-r--r-- perfmon/perfmon_intr.c | 295
-rw-r--r-- perfmon/perfmon_pmu.c | 269
-rw-r--r-- perfmon/perfmon_priv.h | 131
-rw-r--r-- perfmon/perfmon_res.c | 223
-rw-r--r-- perfmon/perfmon_rw.c | 449
-rw-r--r-- perfmon/perfmon_syscalls.c | 741
-rw-r--r-- perfmon/perfmon_sysfs.c | 344
67 files changed, 7196 insertions, 37 deletions
diff --git a/Documentation/ABI/testing/sysfs-perfmon b/Documentation/ABI/testing/sysfs-perfmon
new file mode 100644
index 000000000000..79c66b59ec5b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon
@@ -0,0 +1,42 @@
+What: /sys/kernel/perfmon
+Date: Oct 2008
+KernelVersion: 2.6.27
+Contact: eranian@gmail.com
+
+Description: Provides the configuration interface for the perfmon subsystem.
+ The tree contains information about the detected hardware, the
+ current state of the subsystem, as well as some configuration
+ parameters.
+
+ The tree consists of the following entries:
+
+ /sys/kernel/perfmon/debug (read-write):
+
+ Enable perfmon debugging output. The traces are rate-limited
+ to avoid flooding the console. It is possible to change the
+ throttling via /proc/sys/kernel/printk_ratelimit.
+
+ The value is interpreted as a bitmask. Each bit enables a
+ particular type of debug messages. Refer to the file
+ include/linux/perfmon_kern.h for more information.
+
+ /sys/kernel/perfmon/task_group (read-write):
+
+ User group allowed to create a per-thread context (session).
+ -1 means any group.
+
+ /sys/kernel/perfmon/task_sessions_count (read-only):
+
+ Number of per-thread contexts (sessions) currently attached
+ to threads.
+
+ /sys/kernel/perfmon/version (read-only):
+
+ Perfmon interface revision number.
+
+ /sys/kernel/perfmon/arg_mem_max (read-write):
+
+ Maximum size of vector arguments expressed in bytes.
+ It can be modified but must be at least a page.
+ Default: PAGE_SIZE
+
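+ As an illustration, a minimal user-space sketch that reads the
+ interface revision and enables one class of debug messages
+ (assuming the entries above exist and the caller has the required
+ privileges; the meaning of each debug bit is defined in
+ include/linux/perfmon_kern.h):
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+         char buf[32];
+         FILE *f;
+
+         /* read the perfmon interface revision */
+         f = fopen("/sys/kernel/perfmon/version", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("perfmon version: %s", buf);
+         if (f)
+             fclose(f);
+
+         /* enable the first class of debug messages (bit 0) */
+         f = fopen("/sys/kernel/perfmon/debug", "w");
+         if (f) {
+             fprintf(f, "1\n");
+             fclose(f);
+         }
+         return 0;
+     }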
diff --git a/Documentation/ABI/testing/sysfs-perfmon-pmu b/Documentation/ABI/testing/sysfs-perfmon-pmu
new file mode 100644
index 000000000000..2fa5a7ca8e8b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon-pmu
@@ -0,0 +1,48 @@
+What: /sys/kernel/perfmon/pmu
+Date: Nov 2007
+KernelVersion: 2.6.24
+Contact: eranian@gmail.com
+
+Description: Provides information about the active PMU description
+ module. The module contains the mapping of the actual
+ performance counter registers onto the logical PMU exposed by
+ perfmon. There is at most one PMU description module loaded
+ at any time.
+
+ The sysfs PMU tree provides a description of the mapping for
+ each register. There is one subdir per config and data register,
+ along with an entry for the name of the PMU model.
+
+ The entries are as follows:
+
+ /sys/kernel/perfmon/pmu_desc/model (read-only):
+
+ Name of the PMU model, in clear text, zero-terminated.
+
+ Then, each logical PMU register, XX, gets a subtree with the
+ following entries:
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only):
+
+ The physical address or index of the actual underlying hardware
+ register. On Itanium, it corresponds to the register index; on
+ X86 processors, it is the actual MSR address.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only):
+
+ The default value of the register in hexadecimal.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only):
+
+ The name of the hardware register.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only):
+
+ Bitmask of reserved bits, i.e., bits which cannot be changed
+ by applications. When a bit is set, it means the corresponding
+ bit in the actual register is reserved.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only):
+
+ The width in bits of the register. This field is only
+ relevant for counter registers.
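+
+ As an illustration only, the mapping can be inspected from user
+ space once a PMU description module is loaded; the register subdir
+ name used below (pmc0) is just an example of the pm*XX entries
+ described above:
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+         char buf[64];
+         FILE *f;
+
+         /* name of the active PMU model */
+         f = fopen("/sys/kernel/perfmon/pmu_desc/model", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("PMU model: %s", buf);
+         if (f)
+             fclose(f);
+
+         /* hardware address (MSR) or index of one config register */
+         f = fopen("/sys/kernel/perfmon/pmu_desc/pmc0/addr", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("pmc0 addr: %s", buf);
+         if (f)
+             fclose(f);
+         return 0;
+     }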
diff --git a/Documentation/perfmon.txt b/Documentation/perfmon.txt
new file mode 100644
index 000000000000..818c53770e8b
--- /dev/null
+++ b/Documentation/perfmon.txt
@@ -0,0 +1,206 @@
+ The perfmon hardware monitoring interface
+ ------------------------------------------
+ Stephane Eranian
+ <eranian@gmail.com>
+
+I/ Introduction
+
+ The perfmon interface provides access to the hardware performance counters
+ of major processors. Nowadays, all processors implement some flavor of
+ performance counters which capture micro-architectural level information
+ such as the number of elapsed cycles, number of cache misses, and so on.
+
+ The interface is implemented as a set of new system calls and a set of
+ config files in /sys.
+
+ It is possible to monitor a single thread or a CPU. In either mode,
+ applications can count or sample. System-wide monitoring is supported by
+ running a monitoring session on each CPU. The interface supports event-based
+ sampling, where the sampling period is expressed as the number of occurrences
+ of an event instead of just a timeout. This approach provides better
+ granularity and flexibility.
+
+ For performance reasons, it is possible to use a kernel-level sampling buffer
+ to minimize the overhead incurred by sampling. The format of the buffer,
+ what is recorded, how it is recorded, and how it is exported to user space
+ are controlled by a kernel module called a sampling format. The current
+ implementation comes with a default format, but it is possible to create
+ additional formats. There is a kernel registration interface for formats.
+ Each format is identified by a simple string which a tool can pass when a
+ monitoring session is created.
+
+ The interface also provides support for event set and multiplexing to work
+ around hardware limitations in the number of available counters or in how
+ events can be combined. Each set defines as many counters as the hardware
+ can support. The kernel then multiplexes the sets. The interface supports
+ time-based switching but also overflow-based switching, i.e., after n
+ overflows of designated counters.
+
+ Applications never manipulate the actual performance counter registers.
+ Instead, they see a logical Performance Monitoring Unit (PMU) composed of a
+ set of config registers (PMC) and a set of data registers (PMD). Note that
+ PMDs are not necessarily counters; they can be buffers. The logical PMU is
+ then mapped onto the actual PMU using a mapping table which is implemented
+ as a kernel module. The mapping is chosen once for each new processor. It is
+ visible in /sys/kernel/perfmon/pmu_desc. The kernel module is automatically
+ loaded on first use.
+
+ A monitoring session is uniquely identified by a file descriptor obtained
+ when the session is created. File-sharing semantics apply when accessing the
+ session inside a process. A session is never inherited across fork. The file
+ descriptor can be used to receive notifications of counter overflows or of a
+ full sampling buffer. It is possible to use poll/select on the descriptor
+ to wait for notifications from multiple sessions. Similarly, the descriptor
+ supports asynchronous notifications via SIGIO.
+
+ Counters are always exported as being 64-bit wide regardless of what the
+ underlying hardware implements.
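+
+ For example, a tool holding several session descriptors can wait for
+ notifications with poll(); this is only a sketch, the layout of the
+ notification message itself is defined by the perfmon header files and
+ is not shown here:
+
+     #include <poll.h>
+
+     /* wait up to 1s for an overflow or buffer-full notification */
+     int wait_for_notification(int fd1, int fd2)
+     {
+         struct pollfd fds[2] = {
+             { .fd = fd1, .events = POLLIN },
+             { .fd = fd2, .events = POLLIN },
+         };
+         int n = poll(fds, 2, 1000);
+
+         if (n <= 0)
+             return -1;    /* timeout or error */
+         /* the readable descriptor carries the pending notification */
+         return (fds[0].revents & POLLIN) ? fd1 : fd2;
+     }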
+
+II/ Kernel compilation
+
+ To enable perfmon, you need to enable CONFIG_PERFMON and also some of the
+ model-specific PMU modules.
+
+III/ OProfile interactions
+
+ The set of features offered by perfmon is rich enough to support migrating
+ Oprofile on top of it. That means that PMU programming and low-level
+ interrupt handling could be done by perfmon. The Oprofile sampling buffer
+ management code in the kernel as well as how samples are exported to users
+ could remain through the use of a sampling format. This is how Oprofile
+ works on Itanium.
+
+ The current interactions with Oprofile are:
+ - on X86: Both subsystems can be compiled into the same kernel. There
+ is enforced mutual exclusion between the two subsystems. When
+ there is an Oprofile session, no perfmon session can exist
+ and vice-versa.
+
+ - On IA-64: Oprofile works on top of perfmon. Oprofile being a
+ system-wide monitoring tool, the regular per-thread vs.
+ system-wide session restrictions apply.
+
+ - on PPC: no integration yet. Only one subsystem can be enabled.
+ - on MIPS: no integration yet. Only one subsystem can be enabled.
+
+IV/ User tools
+
+ We have released a simple monitoring tool to demonstrate the features of
+ the interface. The tool is called pfmon and it comes with a simple helper
+ library called libpfm. The library comes with a set of examples to show
+ how to use the kernel interface. Visit http://perfmon2.sf.net for details.
+
+ There may be other tools available for perfmon.
+
+V/ How to program?
+
+ The best way to learn how to program perfmon is to take a look at the
+ source code for the examples in libpfm. The source code is available from:
+
+ http://perfmon2.sf.net
+
+VI/ System calls overview
+
+ In this section, we describe the state of the interface as submitted to the
+ kernel. There are more extensions available, and we will update the section
+ as they get implemented in the upstream kernel.
+
+ The interface is implemented by the following system calls:
+
+ * int pfm_create(int flags, pfarg_sinfo_t *s);
+
+ This function creates a perfmon per-thread session.
+ The flags parameter is currently unused and must be set to 0.
+
+ Upon return, and if s is not NULL, the kernel returns the list of available
+ PMC and PMD registers. Tools should not assume they have access to the
+ entire PMU; it may be shared with other kernel subsystems, e.g., on X86
+ with the NMI watchdog timer.
+
+ The function returns the file descriptor identifying the session.
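+
+ A minimal sketch of creating a session, assuming the patched kernel
+ headers export __NR_pfm_create and pfarg_sinfo_t (the header location,
+ <linux/perfmon.h>, is an assumption here):
+
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int create_session(void)
+     {
+         pfarg_sinfo_t sif;
+
+         /* flags must be 0; sif describes the available PMC/PMD */
+         return syscall(__NR_pfm_create, 0, &sif);
+     }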
+
+ * int pfm_write(int fd, int flags, int type, void *d, size_t sz)
+
+ This function is used to write PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to write and determines the type
+ of the d parameter. The following types are defined:
+
+ - PFM_RW_PMC: write PMC registers, expects a pfarg_pmr_t pointer for d
+ - PFM_RW_PMD: write PMD registers, expects a pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
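+ A sketch of programming one config and one data register follows. The
+ pfarg_pmr_t field names used below (reg_num, reg_value) are assumptions;
+ the actual layout is defined in the perfmon header files:
+
+     #include <string.h>
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int program_counter(int fd, unsigned long long event_encoding)
+     {
+         pfarg_pmr_t pc, pd;
+
+         memset(&pc, 0, sizeof(pc));
+         memset(&pd, 0, sizeof(pd));
+         pc.reg_num = 0;              /* assumed field names */
+         pc.reg_value = event_encoding;
+         pd.reg_num = 0;              /* counter paired with PMC0 */
+         pd.reg_value = 0;
+
+         /* one type per call: first the PMC, then the PMD */
+         if (syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pc, sizeof(pc)))
+             return -1;
+         return syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+     }
+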
+ * int pfm_read(int fd, int flags, int type, void *d, size_t sz);
+
+ This function is used to read PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to read and determines the type
+ of the d parameter. The following types are supported:
+
+ - PFM_RW_PMD: read PMD registers, expects a pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ Reading of PMC registers is not allowed.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
+
+ * int pfm_attach(int fd, int flags, int target);
+
+ This function is used to attach the session to, and detach it from, a
+ thread.
+
+ To attach, the thread is identified by target, which must be the
+ value returned by gettid() (not pthread_self()). For a single-threaded
+ process, that value is equal to the value returned by getpid().
+
+ To detach, the special target PFM_NO_TARGET must be passed.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The session is always attached as stopped, i.e., with monitoring
+ inactive. Monitoring is always stopped as a consequence of detaching.
+
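+ For instance, attaching the session to the calling thread and detaching
+ it again could look like the sketch below, assuming PFM_NO_TARGET and
+ __NR_pfm_attach come from the perfmon headers:
+
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     /* attach to the calling thread: gettid(), not pthread_self() */
+     int attach_self(int fd)
+     {
+         pid_t tid = syscall(SYS_gettid);
+
+         return syscall(__NR_pfm_attach, fd, 0, tid);
+     }
+
+     /* detach: monitoring is stopped as a side effect */
+     int detach(int fd)
+     {
+         return syscall(__NR_pfm_attach, fd, 0, PFM_NO_TARGET);
+     }
+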
+ * int pfm_set_state(int fd, int flags, int state);
+
+ This function is used to set the running state of the session. The state
+ to switch to is indicated by state.
+
+ The following states are defined, only one can be specified at a time:
+
+ - PFM_ST_START: start monitoring
+ - PFM_ST_STOP: stop monitoring
+
+ The flags parameter is currently unused and must be set to 0.
+
+ * int close(int fd)
+
+ To destroy a session, the regular close() system call is used.
+
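+ Putting it all together, a self-monitoring count of a single event could
+ look like the sketch below. Register numbers, the event encoding and the
+ pfarg_* field names are placeholders; the real values come from the PMU
+ description (see /sys/kernel/perfmon/pmu_desc) and the perfmon headers:
+
+     #include <stdio.h>
+     #include <string.h>
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int main(void)
+     {
+         pfarg_sinfo_t sif;
+         pfarg_pmr_t pc, pd;
+         int fd;
+
+         fd = syscall(__NR_pfm_create, 0, &sif);
+         if (fd < 0)
+             return 1;
+
+         memset(&pc, 0, sizeof(pc));
+         memset(&pd, 0, sizeof(pd));
+         pc.reg_num = 0;      /* placeholder PMC */
+         pc.reg_value = 0;    /* placeholder event encoding */
+         pd.reg_num = 0;      /* counter paired with the PMC */
+         pd.reg_value = 0;
+
+         syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pc, sizeof(pc));
+         syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+
+         syscall(__NR_pfm_attach, fd, 0, (pid_t) syscall(SYS_gettid));
+         syscall(__NR_pfm_set_state, fd, 0, PFM_ST_START);
+
+         /* ... workload to measure ... */
+
+         syscall(__NR_pfm_set_state, fd, 0, PFM_ST_STOP);
+         syscall(__NR_pfm_read, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+         printf("count: %llu\n", (unsigned long long) pd.reg_value);
+
+         close(fd);    /* destroys the session */
+         return 0;
+     }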
+
+VII/ /sys interface overview
+
+ Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed
+ description of the sysfs interface of perfmon2.
+
+VIII/ debugfs interface overview
+
+ Refer to Documentation/perfmon-debugfs.txt for a detailed description of the
+ debug and statistics interface of perfmon.
+
+IX/ Documentation
+
+ Visit http://perfmon2.sf.net
diff --git a/Makefile b/Makefile
index 7f9ff9bf1544..b14977d28eab 100644
--- a/Makefile
+++ b/Makefile
@@ -621,6 +621,8 @@ export mod_strip_cmd
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-$(CONFIG_PERFMON) += perfmon/
+
vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
$(net-y) $(net-m) $(libs-y) $(libs-m)))
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..ad604df6a2b6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config IA64_MCA_RECOVERY
tristate "MCA recovery from errors other than TLB."
-config PERFMON
+config PERFMON_V20
bool "Performance monitor support"
help
Selects whether support for the IA-64 performance monitor hardware
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 6dd8655664f3..2c04fbe6c414 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
#
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index e05f9e1d3faa..7d89a19fc8b3 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index e86fbd39c795..5f8c7721e29a 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig
index 546a772f438e..d51457af7ca6 100644
--- a/arch/ia64/configs/sim_defconfig
+++ b/arch/ia64/configs/sim_defconfig
@@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-# CONFIG_PERFMON is not set
+# CONFIG_PERFMON_V20 is not set
CONFIG_IA64_PALINFO=m
#
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index c522edf23c62..318d846ab253 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y
CONFIG_HOLES_IN_ZONE=y
# CONFIG_IA32_SUPPORT is not set
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
# CONFIG_IA64_ESI is not set
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 0a06b1333c95..2bf0ad40398f 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_ESI is not set
# CONFIG_KEXEC is not set
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index f88fa054d01d..3ecf7e0b44cb 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -321,7 +321,7 @@ struct thread_struct {
#else
# define INIT_THREAD_IA32
#endif /* CONFIG_IA32_SUPPORT */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
void *pfm_context; /* pointer to detailed PMU context */
unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */
# define INIT_THREAD_PM .pfm_context = NULL, \
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 927a381c20ca..387e54030af1 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
# define IA64_ACCOUNT_ON_SWITCH(p,n)
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
DECLARE_PER_CPU(unsigned long, pfm_syst_info);
# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
#else
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea954892..93819cca7d96 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_NUMA) += numa.o
-obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
+obj-$(CONFIG_PERFMON_V20) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..db54bd497cf6 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -40,7 +40,7 @@
#include <asm/system.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -660,7 +660,7 @@ init_IRQ (void)
}
#endif
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
platform_irq_init();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309b..5f6efcfa2de4 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -52,7 +52,7 @@
#include <asm/uaccess.h>
#include <asm/delay.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* perfmon context state
*/
@@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
* the psr bits are already set properly in copy_threads()
*/
}
-#else /* !CONFIG_PERFMON */
+#else /* !CONFIG_PERFMON_V20 */
asmlinkage long
sys_perfmonctl (int fd, int cmd, void *arg, int count)
{
return -ENOSYS;
}
-#endif /* CONFIG_PERFMON */
+#endif /* CONFIG_PERFMON_V20 */
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index c57162705147..afbf1a8205ee 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -46,7 +46,7 @@
#include "entry.h"
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
return;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_needs_checking)
/*
* Note: pfm_handle_work() allow us to call it with interrupts
@@ -334,14 +334,14 @@ cpu_idle (void)
void
ia64_save_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_save_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_save_regs(task);
@@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task)
void
ia64_load_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_load_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_load_regs(task);
@@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags,
}
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_context)
pfm_inherit(p, child_ptregs);
#endif
@@ -735,7 +735,7 @@ exit_thread (void)
{
ia64_drop_fpu(current);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* if needed, stop monitoring and flush state to perfmon context */
if (current->thread.pfm_context)
pfm_exit_thread(current);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 92c9689b7d97..ffd212fd2d36 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -31,7 +31,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unwind.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
#include <asm/perfmon.h>
#endif
@@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr,
"address 0x%lx\n", addr);
return -1;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* Check if debug registers are used by perfmon. This
* test must be done once we know that we can do the
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1dcbb85fc4ee..f865315a9248 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -381,7 +381,7 @@ smp_callin (void)
extern void ia64_init_itm(void);
extern volatile int time_keeper_id;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
extern void pfm_init_percpu(void);
#endif
@@ -411,7 +411,7 @@ smp_callin (void)
ia64_mca_cmc_vector_setup(); /* Setup vector on AP */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 98771e2a78af..754f4153123e 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-lib-$(CONFIG_PERFMON) += carta_random.o
+lib-$(CONFIG_PERFMON_V20) += carta_random.o
AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile
index aad27a718ee0..3323fd5a46e9 100644
--- a/arch/ia64/oprofile/Makefile
+++ b/arch/ia64/oprofile/Makefile
@@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_PERFMON) += perfmon.o
+oprofile-$(CONFIG_PERFMON_V20) += perfmon.o
diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c
index 31b545c35460..9ed2bc152fba 100644
--- a/arch/ia64/oprofile/init.c
+++ b/arch/ia64/oprofile/init.c
@@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
{
int ret = -ENODEV;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* perfmon_init() can fail, but we have no way to report it */
ret = perfmon_init(ops);
#endif
@@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
void oprofile_arch_exit(void)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
perfmon_exit();
#endif
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b5e714373385..cdc53491c033 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
+source "arch/x86/perfmon/Kconfig"
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cf72b569db41..f3af2b0b4f15 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
+# perfmon support
+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
+
# Remaining sub architecture files
core-y += $(mcore-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..891af3e6b3a6 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -826,4 +826,9 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad sys_pfm_create
+ .quad sys_pfm_write
+ .quad sys_pfm_read /* 335 */
+ .quad sys_pfm_attach
+ .quad sys_pfm_set_state
ia32_syscall_end:
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..15d495f73485 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += perfmon.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..0ba6dd3aa24e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
+ * Perfmon PMU interrupt vector
+ */
+#define LOCAL_PERFMON_VECTOR 0xee
+
+/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..e940722dc1f0 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
+#ifdef CONFIG_PERFMON
+BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR)
+#endif
+
#endif
diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h
new file mode 100644
index 000000000000..906f4b24cf0c
--- /dev/null
+++ b/arch/x86/include/asm/perfmon.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This file contains i386/x86_64 specific definitions for the perfmon
+ * interface.
+ *
+ * This file MUST never be included directly. Use linux/perfmon.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_H_
+#define _ASM_X86_PERFMON_H_
+
+/*
+ * arch-specific user visible interface definitions
+ */
+
+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
+
+#endif /* _ASM_X86_PERFMON_H_ */
diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h
new file mode 100644
index 000000000000..7cadbb894e83
--- /dev/null
+++ b/arch/x86/include/asm/perfmon_kern.h
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This file contains X86 Processor Family specific definitions
+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_KERN_H_
+#define _ASM_X86_PERFMON_KERN_H_
+
+#ifdef CONFIG_PERFMON
+#include <linux/unistd.h>
+#ifdef CONFIG_4KSTACKS
+#define PFM_ARCH_STK_ARG 8
+#else
+#define PFM_ARCH_STK_ARG 16
+#endif
+
+struct pfm_arch_pmu_info {
+ u32 flags; /* PMU feature flags */
+ /*
+ * mandatory model-specific callbacks
+ */
+ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
+ int (*has_ovfls)(struct pfm_context *ctx);
+ void (*quiesce)(void);
+
+ /*
+ * optional model-specific callbacks
+ */
+ void (*acquire_pmu_percpu)(void);
+ void (*release_pmu_percpu)(void);
+ int (*load_context)(struct pfm_context *ctx);
+ void (*unload_context)(struct pfm_context *ctx);
+};
+
+/*
+ * PMU feature flags
+ */
+#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */
+#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */
+
+struct pfm_x86_ctx_flags {
+ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */
+ unsigned int reserved:31; /* for future use */
+};
+
+struct pfm_arch_context {
+ u64 saved_real_iip; /* instr pointer of last NMI intr */
+ struct pfm_x86_ctx_flags flags; /* flags */
+ int saved_started;
+};
+
+/*
+ * functions implemented as inline on x86
+ */
+
+/**
+ * pfm_arch_write_pmc - write a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ * @value: PMC 64-bit value
+ *
+ * in certain situations, ctx may be NULL
+ */
+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * we only write to the actual register when monitoring is
+ * active (pfm_start was issued)
+ */
+ if (ctx && ctx->flags.started == 0)
+ return;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_write_pmd - write a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ * @value: PMD 64-bit value
+ */
+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * to make sure the counter overflows, we set the
+ * upper bits. We also clear any other unimplemented
+ * bits as they may cause a crash on some processors.
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
+ value = (value | ~pfm_pmu_conf->ovfl_mask)
+ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_read_pmd - read a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_read_pmc - read a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_is_active - return non-zero if monitoring has been started
+ * @ctx: context to check
+ *
+ * At certain points, perfmon needs to know if monitoring has been
+ * explicitly started.
+ *
+ * On x86, there is no other way but to use pfm_start/pfm_stop
+ * to activate monitoring, thus we can simply check flags.started
+ */
+static inline int pfm_arch_is_active(struct pfm_context *ctx)
+{
+ return ctx->flags.started;
+}
+
+
+/**
+ * pfm_arch_unload_context - detach context from thread or CPU
+ * @ctx: context to detach
+ *
+ * in system-wide ctx->task is NULL, otherwise it points to the
+ * attached thread
+ */
+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ if (ctx_arch->flags.insecure) {
+ PFM_DBG("clear cr4.pce");
+ clear_in_cr4(X86_CR4_PCE);
+ }
+
+ if (pmu_info->unload_context)
+ pmu_info->unload_context(ctx);
+}
+
+/**
+ * pfm_arch_load_context - attach context to thread or CPU
+ * @ctx: context to attach
+ */
+static inline int pfm_arch_load_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+ int ret = 0;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * RDPMC authorized in system-wide and
+ * per-thread self-monitoring.
+ *
+ * RDPMC only gives access to counts.
+ *
+ * The context-switch routine code does not restore
+ * all the PMD registers (optimization), thus there
+ * is a possible leak of counts there in per-thread
+ * mode.
+ */
+ if (ctx->task == current) {
+ PFM_DBG("set cr4.pce");
+ set_in_cr4(X86_CR4_PCE);
+ ctx_arch->flags.insecure = 1;
+ }
+
+ if (pmu_info->load_context)
+ ret = pmu_info->load_context(ctx);
+
+ return ret;
+}
+
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
+
+/**
+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
+ * @ctx: current context
+ * @set: current event set
+ *
+ * called from __pfm_interrupt_handler().
+ * ctx is not NULL. ctx is locked. interrupts are masked
+ *
+ * The following actions must take place:
+ * - stop all monitoring to ensure handler has consistent view.
+ * - collect overflowed PMDs bitmask into povfls_pmds and
+ * npend_ovfls. If no interrupt detected then npend_ovfls
+ * must be set to zero.
+ */
+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_context *ctx_arch;
+ ctx_arch = pfm_ctx_arch(ctx);
+ /*
+ * on X86, freezing is equivalent to stopping
+ */
+ pfm_arch_stop(current, ctx);
+
+ /*
+ * we mark monitoring as stopped to avoid
+ * certain side effects especially in
+ * pfm_arch_restore_pmcs()
+ */
+ ctx_arch->saved_started = ctx->flags.started;
+ ctx->flags.started = 0;
+}
+
+/**
+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
+ * @ctx: current context
+ *
+ * current context may be NULL when dealing with spurious interrupts
+ *
+ * Must re-activate monitoring if context is not MASKED.
+ * interrupts are masked.
+ */
+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ if (ctx == NULL)
+ return;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ PFM_DBG_ovfl("state=%d", ctx->state);
+
+ /*
+ * restore flags.started which is cleared in
+ * pfm_arch_intr_freeze_pmu()
+ */
+ ctx->flags.started = ctx_arch->saved_started;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
+ * @ctx: current context
+ * @cnum: PMD index
+ *
+ * On some CPUs, the upper bits of a counter must be set in order for the
+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
+ * and the upper bits are cleared. This function may be used to set them back.
+ *
+ * For x86, a simple reset would lose whatever is remaining in the counter,
+ * which is usually a small count. In order not to lose this count,
+ * we do a read-modify-write to set the upper bits while preserving the
+ * low-order bits. This is slow but works.
+ */
+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 val;
+ val = pfm_arch_read_pmd(ctx, cnum);
+ pfm_arch_write_pmd(ctx, cnum, val);
+}
+
+/**
+ * pfm_arch_context_create - create context
+ * @ctx: newly created context
+ * @flags: context flags as passed by user
+ *
+ * called from __pfm_create_context()
+ */
+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
+{
+ return 0;
+}
+
+/**
+ * pfm_arch_context_free - free context
+ * @ctx: context to free
+ */
+static inline void pfm_arch_context_free(struct pfm_context *ctx)
+{}
+
+/*
+ * functions implemented in arch/x86/perfmon/perfmon.c
+ */
+int pfm_arch_init(void);
+void pfm_arch_resend_irq(struct pfm_context *ctx);
+
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
+
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
+void pfm_arch_pmu_config_remove(void);
+char *pfm_arch_get_pmu_module_name(void);
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
+void pfm_arch_pmu_release(void);
+
+static inline void pfm_arch_serialize(void)
+{}
+
+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
+{}
+
+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
+{}
+
+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
+/*
+ * x86 does not need extra alignment requirements for the sampling buffer
+ */
+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
+
+asmlinkage void pmu_interrupt(void);
+
+static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits)
+{
+ bitmap_copy((unsigned long *)a,
+ (unsigned long *)b,
+ nbits);
+}
+
+static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_or((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_and((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+
+static inline void pfm_arch_bv_zero(u64 *a, int nbits)
+{
+ bitmap_zero((unsigned long *)a, nbits);
+}
+
+static inline int pfm_arch_bv_weight(u64 *a, int nbits)
+{
+ return bitmap_weight((unsigned long *)a, nbits);
+}
+
+static inline void pfm_arch_bv_set_bit(int b, u64 *a)
+{
+ __set_bit(b, (unsigned long *)a);
+}
+
+static inline void pfm_arch_bv_clear_bit(int b, u64 *a)
+{
+ __clear_bit(b, (unsigned long *)a);
+}
+
+static inline int pfm_arch_bv_test_bit(int b, u64 *a)
+{
+ return test_bit(b, (unsigned long *)a);
+}
+
+static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr,
+ unsigned long size,
+ unsigned long offset)
+{
+ return find_next_bit((unsigned long *)addr,
+ size,
+ offset);
+}
+#endif /* CONFIG_PERFMON */
+
+#endif /* _ASM_X86_PERFMON_KERN_H_ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0ddd534bef44 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
@@ -92,6 +93,7 @@ struct thread_info {
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
+#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -114,6 +116,8 @@ struct thread_info {
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -135,12 +139,12 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERFMON_WORK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
- _TIF_NOTSC)
+ _TIF_NOTSC|_TIF_PERFMON_CTXSW)
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..06908451002f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,11 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_pfm_create 333
+#define __NR_pfm_write (__NR_pfm_create+1)
+#define __NR_pfm_read (__NR_pfm_create+2)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89fb..a42bb5eb9edb 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_pfm_create 295
+__SYSCALL(__NR_pfm_create, sys_pfm_create)
+#define __NR_pfm_write (__NR_pfm_create+1)
+__SYSCALL(__NR_pfm_write, sys_pfm_write)
+#define __NR_pfm_read (__NR_pfm_create+2)
+__SYSCALL(__NR_pfm_read, sys_pfm_read)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+__SYSCALL(__NR_pfm_attach, sys_pfm_attach)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
+__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9134de814c97..9f8826f33032 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -513,7 +513,7 @@ ENDPROC(system_call)
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
- testb $_TIF_NEED_RESCHED, %cl
+ testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx
jz work_notifysig
work_resched:
call schedule
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 983d85aeccce..1d9bef0797d9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -876,7 +876,13 @@ END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
-
+
+#ifdef CONFIG_PERFMON
+ENTRY(pmu_interrupt)
+ apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt
+END(pmu_interrupt)
+#endif
+
/*
* Exception entry points.
*/
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..24a0140e6c36 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/perfmon_kern.h>
#include <asm/acpi.h>
#include <asm/atomic.h>
@@ -224,6 +225,10 @@ void __init native_init_IRQ(void)
apic_intr_init();
+#ifdef CONFIG_PERFMON
+ alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
+
if (!acpi_ioapic)
setup_irq(2, &irq2);
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..7ff71d4d6d9b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,6 +36,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
@@ -258,6 +259,7 @@ void exit_thread(void)
ds_free(current->thread.ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(gs, p->thread.gs);
+ pfm_copy_thread(p);
+
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
set_debugreg(next->debugreg1, 1);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3180e79c3697..86099f98104a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
@@ -255,6 +256,7 @@ void exit_thread(void)
ds_free(t->ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ pfm_copy_thread(p);
+
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
@@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
prev = &prev_p->thread,
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = prev->debugctlmsr;
#ifdef CONFIG_X86_DS
@@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
loaddebug(next, 1);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 27a5c8174322..7d6fc603dea7 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -19,6 +19,7 @@
#include <linux/wait.h>
#include <linux/tracehook.h>
#include <linux/elf.h>
+#include <linux/perfmon_kern.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index d2307e41fbdb..24e389836fc0 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -21,6 +21,7 @@
#include <linux/personality.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
+#include <linux/perfmon_kern.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..81c22739f70b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,8 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_pfm_create
+ .long sys_pfm_write
+ .long sys_pfm_read /* 335 */
+ .long sys_pfm_attach
+ .long sys_pfm_set_state
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 022cd41ea9b4..584a9ef4e44c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -17,6 +17,7 @@
#include <linux/moduleparam.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
+#include <linux/perfmon_kern.h>
#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -142,12 +143,18 @@ static int nmi_setup(void)
int err = 0;
int cpu;
- if (!allocate_msrs())
+ if (pfm_session_allcpus_acquire())
+ return -EBUSY;
+
+ if (!allocate_msrs()) {
+ pfm_session_allcpus_release();
return -ENOMEM;
+ }
err = register_die_notifier(&profile_exceptions_nb);
if (err) {
free_msrs();
+ pfm_session_allcpus_release();
return err;
}
@@ -228,6 +235,7 @@ static void nmi_shutdown(void)
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
+ pfm_session_allcpus_release();
put_cpu_var(cpu_msrs);
}
diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig
new file mode 100644
index 000000000000..8144d1d0d600
--- /dev/null
+++ b/arch/x86/perfmon/Kconfig
@@ -0,0 +1,33 @@
+menu "Hardware Performance Monitoring support"
+config PERFMON
+ bool "Perfmon2 performance monitoring interface"
+ select X86_LOCAL_APIC
+ default n
+ help
+ Enables the perfmon2 interface to access the hardware
+ performance counters. See <http://perfmon2.sf.net/> for
+ more details.
+
+config PERFMON_DEBUG
+ bool "Perfmon debugging"
+ default n
+ depends on PERFMON
+ help
+ Enables perfmon debugging support
+
+config X86_PERFMON_INTEL_ARCH
+ bool "Support for Intel architectural perfmon v1/v2/v3"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Intel architectural performance counters.
+ This feature was introduced with Intel Core Solo/Core Duo processors.
+
+config X86_PERFMON_AMD64
+ bool "Support AMD Athlon/Opteron hardware performance counters"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Athlon/Opteron hardware performance counters.
+ Supports family 6, 15 and 16 processors.
+ endmenu
diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile
new file mode 100644
index 000000000000..c0a4ca0da329
--- /dev/null
+++ b/arch/x86/perfmon/Makefile
@@ -0,0 +1,7 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
+obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o
diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c
new file mode 100644
index 000000000000..844f19dc6cb0
--- /dev/null
+++ b/arch/x86/perfmon/perfmon.c
@@ -0,0 +1,619 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/interrupt.h>
+#include <linux/perfmon_kern.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU(unsigned long, real_iip);
+DEFINE_PER_CPU(int, pfm_using_nmi);
+
+/**
+ * pfm_arch_ctxswin_thread - thread context switch in
+ * @task: task switched in
+ * @ctx: context for the task
+ * @set: active event set
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * set cannot be NULL. Context is locked. Interrupts are masked.
+ *
+ * Caller has already restored all PMD and PMC registers, if
+ * necessary (i.e., lazy restore scheme).
+ *
+ * On x86, the common code only needs to unsecure RDPMC if necessary.
+ *
+ * Model-specific features, e.g., PEBS and IBS, are taken care of in the
+ * corresponding PMU description module.
+ */
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ /*
+ * restore saved real iip
+ */
+ if (ctx->active_set->npend_ovfls)
+ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
+
+ /*
+ * enable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ set_in_cr4(X86_CR4_PCE);
+}
+
+/**
+ * pfm_arch_ctxswout_thread - context switch out thread
+ * @task: task switched out
+ * @ctx : context switched out
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked. Monitoring may be active.
+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
+ *
+ * Return:
+ * non-zero : did not save PMDs (as part of stopping the PMU)
+ * 0 : saved PMDs (no need to save them in caller)
+ */
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * disable lazy restore of PMCS on ctxswin because
+ * we modify some of them.
+ */
+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+
+ if (ctx->active_set->npend_ovfls)
+ ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
+
+ /*
+ * disable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ clear_in_cr4(X86_CR4_PCE);
+
+ return pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_stop - deactivate monitoring
+ * @task: task to stop
+ * @ctx: context to stop
+ *
+ * Called from pfm_stop()
+ * Interrupts are masked. Context is locked. Set is the active set.
+ *
+ * For per-thread:
+ * task is not necessarily current. If not current task, then
+ * task is guaranteed stopped and off any cpu. Access to PMU
+ * is not guaranteed.
+ *
+ * must disable active monitoring. ctx cannot be NULL
+ */
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * no need to go through stop_save()
+ * if we are already stopped
+ */
+ if (!ctx->flags.started)
+ return;
+
+ if (task != current)
+ return;
+
+ pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+
+/**
+ * pfm_arch_start - activate monitoring
+ * @task: task to start
+ * @ctx: context to start
+ *
+ * Interrupts are masked. Context is locked.
+ *
+ * For per-thread:
+ * Task is not necessarily current. If not current task, then task
+ * is guaranteed stopped and off any cpu. No access to PMU if task
+ * is not current.
+ */
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
+{
+ /*
+ * cannot restore PMC if no access to PMU. Will be done
+ * when the thread is switched back in
+ */
+ if (task != current)
+ return;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_restore_pmds - reload PMD registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw()
+ *
+ * Context is locked. Interrupts are masked. Set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ */
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ num = set->nused_pmds;
+
+ /*
+ * we can restore only the PMD we use because:
+ *
+ * - can only read with pfm_read_pmds() the registers
+ * declared used via pfm_write_pmds()
+ *
+ * - if cr4.pce=1, only counters are exposed to user. RDPMC
+ * does not work with other types of PMU registers. Thus, no
+ * address is ever exposed by counters
+ *
+ * - there is never a dependency between one pmd register and
+ * another
+ */
+ for (i = 0; num; i++) {
+ if (likely(pfm_arch_bv_test_bit(i, set->used_pmds))) {
+ pfm_write_pmd(ctx, i, set->pmds[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * pfm_arch_restore_pmcs - reload PMC registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked. set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ *
+ * function must restore all PMC registers from set
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ /*
+ * we need to restore PMCs only when:
+ * - context is not masked
+ * - monitoring activated
+ *
+ * Masking monitoring after an overflow does not change the
+ * value of flags.started
+ */
+ if (!ctx->flags.started)
+ return;
+
+ /*
+ * restore all pmcs
+ *
+ * It is not possible to restore only the pmcs we used because
+ * certain PMU models (e.g. Pentium 4) have dependencies. Thus
+ * we do not want one application using stale PMCs coming from
+ * another one.
+ *
+ * On PMU models where there are no dependencies between PMCs, it is
+ * possible to optimize by restoring only the registers that are used,
+ * but this has to be done by model-specific code.
+ */
+ num = ctx->regs.num_pmcs;
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, ctx->regs.pmcs)) {
+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible race with other, higher priority interrupts, such as timer
+ * or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+ unsigned long iip;
+ int using_nmi;
+
+ using_nmi = __get_cpu_var(pfm_using_nmi);
+
+ ack_APIC_irq();
+
+ irq_enter();
+
+ /*
+ * when using NMI, pfm_handle_nmi() gets called
+ * first. It stops monitoring and records the
+ * iip into real_iip, then it reposts the interrupt
+ * using the lower priority vector LOCAL_PERFMON_VECTOR.
+ *
+ * On some processors, e.g., P4, it may be that some
+ * state is already recorded from pfm_handle_nmi()
+ * and it only needs to be copied back into the normal
+ * fields so it can be used transparently by higher level
+ * code.
+ */
+ if (using_nmi)
+ iip = __get_cpu_var(real_iip);
+ else
+ iip = instruction_pointer(regs);
+
+ pfm_interrupt_handler(iip, regs);
+
+ /*
+ * On Intel processors:
+ * - it is necessary to clear the MASK field for the LVTPC
+ * vector. Otherwise interrupts remain masked. See
+ * section 8.5.1
+ * AMD X86-64:
+ * - the documentation does not stipulate the behavior but
+ * it seems to work without the write, so we skip
+ */
+ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+ irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * called from notify_die() notifier from a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, include the perfmon context lock
+ *
+ * Must detect if NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
+ */
+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
+ unsigned long val,
+ void *data)
+{
+ struct die_args *args = data;
+ struct pfm_context *ctx;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ /*
+ * only NMI related calls
+ */
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_DONE;
+
+ /*
+ * perfmon not using NMI
+ */
+ if (!__get_cpu_var(pfm_using_nmi))
+ return NOTIFY_DONE;
+
+ /*
+ * No context
+ */
+ ctx = __get_cpu_var(pmu_ctx);
+ if (!ctx) {
+ PFM_DBG_ovfl("no ctx");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * Detect if we have overflows, i.e., NMI interrupt
+ * caused by PMU
+ */
+ pmu_info = pfm_pmu_info();
+ if (!pmu_info->has_ovfls(ctx)) {
+ PFM_DBG_ovfl("no ovfl");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * we stop the PMU to avoid further overflow before this
+ * one is treated by the lower priority interrupt handler
+ */
+ pmu_info->quiesce();
+
+ /*
+ * record actual instruction pointer
+ */
+ __get_cpu_var(real_iip) = instruction_pointer(args->regs);
+
+ /*
+ * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
+ */
+ pfm_arch_resend_irq(ctx);
+
+ /*
+ * we need to rewrite the APIC vector on Intel
+ */
+ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * the notification was for us
+ */
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block pfm_nmi_nb = {
+ .notifier_call = pfm_handle_nmi
+};
+
+/**
+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
+ *
+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
+ */
+void pfm_arch_resend_irq(struct pfm_context *ctx)
+{
+ unsigned long val, dest;
+ /*
+ * we cannot use hw_resend_irq() because it goes to
+ * the I/O APIC. We need to go to the Local APIC.
+ *
+ * The "int vec" is not the right solution either
+ * because it triggers a software intr. We need
+ * to regenerate the interrupt and have it pended
+ * until we unmask interrupts.
+ *
+ * Instead we send ourself an IPI on the perfmon
+ * vector.
+ */
+ val = APIC_DEST_SELF|APIC_INT_ASSERT|
+ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
+
+ dest = apic_read(APIC_ID);
+ apic_write(APIC_ICR2, dest);
+ apic_write(APIC_ICR, val);
+}
+
+/**
+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
+ * @data: contains pmu flags
+ */
+static void pfm_arch_pmu_acquire_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ unsigned int tmp, vec;
+ unsigned long flags = (unsigned long)data;
+ unsigned long lvtpc;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ /*
+ * we only reprogram the LVTPC vector if we have detected
+ * no sharing, otherwise it means the APIC is already programmed
+ * and we use whatever vector (likely NMI) is there
+ */
+ if (!(flags & PFM_X86_FL_SHARING)) {
+ vec = LOCAL_PERFMON_VECTOR;
+
+ tmp = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, vec);
+ apic_write(APIC_LVTERR, tmp);
+ }
+ lvtpc = (unsigned long)apic_read(APIC_LVTPC);
+
+ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
+
+ PFM_DBG("LTVPC=0x%lx using_nmi=%d",
+ lvtpc, __get_cpu_var(pfm_using_nmi));
+ /*
+ * invoke model specific acquire routine.
+ */
+ if (pmu_info->acquire_pmu_percpu)
+ pmu_info->acquire_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_acquire - acquire PMU resource from system
+ * @unavail_pmcs : bitmask to use to set unavailable pmcs
+ * @unavail_pmds : bitmask to use to set unavailable pmds
+ *
+ * interrupts are not masked
+ *
+ * Grab PMU registers from lower level MSR allocator
+ *
+ * Program the APIC according to the possible interrupt vector,
+ * either LOCAL_PERFMON_VECTOR or NMI
+ */
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_regmap_desc *d;
+ u16 i, nlost;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ pmu_info->flags &= ~PFM_X86_FL_SHARING;
+
+ nlost = 0;
+
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ /*
+ * reserve register with lower-level allocator
+ */
+ if (!reserve_evntsel_nmi(d->hw_addr)) {
+ PFM_DBG("pmc%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmcs);
+ nlost++;
+ continue;
+ }
+ }
+ PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags);
+ /*
+ * some PMU models (e.g., P6) do not support sharing, so check
+ * if we found fewer than the expected number of PMC registers
+ */
+ if (nlost) {
+ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
+ PFM_INFO("PMU already used by another subsystem, "
+ "PMU does not support sharing, "
+ "try disabling Oprofile or "
+ "reboot with nmi_watchdog=0");
+ goto undo;
+ }
+ pmu_info->flags |= PFM_X86_FL_SHARING;
+ }
+
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (!reserve_perfctr_nmi(d->hw_addr)) {
+ PFM_DBG("pmd%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmds);
+ }
+ }
+ /*
+ * program APIC on each CPU
+ */
+ on_each_cpu(pfm_arch_pmu_acquire_percpu,
+ (void *)(unsigned long)pmu_info->flags , 1);
+
+ return 0;
+undo:
+ /*
+ * must undo reservation of pmcs in case of error
+ */
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+ if (!pfm_arch_bv_test_bit(i, unavail_pmcs))
+ release_evntsel_nmi(d->hw_addr);
+ }
+ return -EBUSY;
+}
+
+/**
+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
+ *
+ */
+static void pfm_arch_pmu_release_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+
+ __get_cpu_var(pfm_using_nmi) = 0;
+ /*
+ * invoke model specific release routine.
+ */
+ if (pmu_info->release_pmu_percpu)
+ pmu_info->release_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_release - release PMU resource to system
+ *
+ * called from pfm_pmu_release()
+ * interrupts are not masked
+ *
+ * On x86, we return the PMU registers to the MSR allocator
+ */
+void pfm_arch_pmu_release(void)
+{
+ struct pfm_regmap_desc *d;
+ u16 i, n;
+
+ d = pfm_pmu_conf->pmc_desc;
+ n = pfm_pmu_conf->regs_all.num_pmcs;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs))
+ continue;
+ release_evntsel_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmc%u released", i);
+ }
+ d = pfm_pmu_conf->pmd_desc;
+ n = pfm_pmu_conf->regs_all.num_pmds;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds))
+ continue;
+ release_perfctr_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmd%u released", i);
+ }
+
+ /* clear NMI variable if used */
+ if (__get_cpu_var(pfm_using_nmi))
+ on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1);
+}
+
+/**
+ * pfm_arch_init - one time global arch-specific initialization
+ *
+ * called from pfm_init()
+ */
+int __init pfm_arch_init(void)
+{
+ /*
+ * we need to register our NMI handler when the kernels boots
+ * to avoid a deadlock condition with the NMI watchdog or Oprofile
+ * if we were to try and register/unregister on-demand.
+ */
+ register_die_notifier(&pfm_nmi_nb);
+ return 0;
+}
diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c
new file mode 100644
index 000000000000..f078fe28137d
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_amd64.c
@@ -0,0 +1,483 @@
+/*
+ * This file contains the PMU description for the Athlon64 and Opteron64
+ * processors. It supports 32 and 64-bit modes.
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
+#include <linux/topology.h>
+#include <linux/pci.h>
+#include <linux/perfmon_kern.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+
+static void __kprobes pfm_amd64_quiesce(void);
+static int pfm_amd64_has_ovfls(struct pfm_context *ctx);
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+
+static struct pfm_arch_pmu_info pfm_amd64_pmu_info = {
+ .stop_save = pfm_amd64_stop_save,
+ .has_ovfls = pfm_amd64_has_ovfls,
+ .quiesce = pfm_amd64_quiesce,
+};
+
+/*
+ * force Local APIC interrupt on overflow
+ */
+#define PFM_K8_VAL (1ULL<<20)
+#define PFM_K8_NO64 (1ULL<<20)
+
+/*
+ * reserved bits must be 1
+ *
+ * for family 15:
+ * - upper 32 bits are reserved
+ * - bit 20, bit 21
+ *
+ * for family 16:
+ * - bits 36-39 are reserved
+ * - bits 42-63 are reserved
+ * - bit 20, bit 21
+ *
+ */
+#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
+#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21))
+
+static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = {
+/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0),
+/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1),
+/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2),
+/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3),
+};
+#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc)
+
+/*
+ * AMD64 counters are 48 bits, upper bits are reserved
+ */
+#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1))
+
+#define PFM_AMD_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PERFCTR"#n, \
+ .hw_addr = MSR_K7_PERFCTR0+n, \
+ .rsvd_msk = PFM_AMD64_CTR_RSVD, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = {
+/* pmd0 */ PFM_AMD_D(0),
+/* pmd1 */ PFM_AMD_D(1),
+/* pmd2 */ PFM_AMD_D(2),
+/* pmd3 */ PFM_AMD_D(3)
+};
+#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc)
+
+static struct pfm_context *pfm_nb_task_owner;
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf;
+
+/**
+ * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events
+ * @ctx: context to use
+ *
+ * There can only be one user per socket for the Northbridge (NB) events,
+ * so we enforce mutual exclusion as follows:
+ * - per-thread : only one context machine-wide can use NB events
+ *
+ * Exclusion is enforced at:
+ * - pfm_load_context()
+ * - pfm_write_pmcs() for attached contexts
+ *
+ * Exclusion is released at:
+ * - pfm_unload_context() or any call that implicitly uses it
+ *
+ * return:
+ * 0 : successfully acquire NB access
+ * < 0: errno, failed to acquire NB access
+ */
+static int pfm_amd64_acquire_nb(struct pfm_context *ctx)
+{
+ struct pfm_context **entry, *old;
+ int proc_id;
+
+#ifdef CONFIG_SMP
+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
+#else
+ proc_id = 0;
+#endif
+
+ entry = &pfm_nb_task_owner;
+
+ old = cmpxchg(entry, NULL, ctx);
+ if (!old) {
+ PFM_DBG("acquired Northbridge event access globally");
+ } else if (old != ctx) {
+ PFM_DBG("global NorthBridge event conflict");
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * pfm_amd64_pmc_write_check -- check validity of pmc writes
+ * @ctx: context to use
+ * @set: event set to use
+ * @req: user request to modify the pmc
+ *
+ * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL, i.e.,
+ * when we have detected a multi-core processor.
+ *
+ * context is locked, interrupts are masked
+ */
+static int pfm_amd64_pmc_write_check(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req)
+{
+ unsigned int event;
+
+ /*
+ * delay checking NB event until we load the context
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return 0;
+
+ /*
+ * check event is NB event
+ */
+ event = (unsigned int)(req->reg_value & 0xff);
+ if (event < 0xee)
+ return 0;
+
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_load_context - amd64 model-specific load callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_load_context().
+ * context is locked, interrupts are masked
+ */
+static int pfm_amd64_load_context(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ unsigned int i, n;
+
+ set = ctx->active_set;
+ n = set->nused_pmcs;
+ for (i = 0; n; i++) {
+ if (!pfm_arch_bv_test_bit(i, set->used_pmcs))
+ continue;
+
+ if ((set->pmcs[i] & 0xff) >= 0xee)
+ goto found;
+ n--;
+ }
+ return 0;
+found:
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_unload_context -- amd64 model-specific unload callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_unload_context()
+ */
+static void pfm_amd64_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_context **entry, *old;
+ int proc_id;
+
+#ifdef CONFIG_SMP
+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
+#else
+ proc_id = 0;
+#endif
+
+ entry = &pfm_nb_task_owner;
+
+ old = cmpxchg(entry, ctx, NULL);
+ if (old == ctx)
+ PFM_DBG("released NorthBridge events globally");
+}
+
+/**
+ * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls
+ *
+ * detect if we need to activate NorthBridge event access control
+ */
+static int pfm_amd64_setup_nb_event_ctrl(void)
+{
+ unsigned int c, n = 0;
+ unsigned int max_phys = 0;
+
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(c) {
+ if (cpu_data(c).phys_proc_id > max_phys)
+ max_phys = cpu_data(c).phys_proc_id;
+ }
+#else
+ max_phys = 0;
+#endif
+ if (max_phys > 255) {
+ PFM_INFO("socket id %d is too big to handle", max_phys);
+ return -ENOMEM;
+ }
+
+ n = max_phys + 1;
+ if (n < 2)
+ return 0;
+
+ pfm_nb_task_owner = NULL;
+
+ /*
+ * activate write-checker for PMC registers
+ */
+ for (c = 0; c < PFM_AMD_NUM_PMCS; c++)
+ pfm_amd64_pmc_desc[c].type |= PFM_REG_WC;
+
+ pfm_amd64_pmu_info.load_context = pfm_amd64_load_context;
+ pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context;
+
+ pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check;
+
+ PFM_INFO("NorthBridge event access control enabled");
+
+ return 0;
+}
+
+/**
+ * pfm_amd64_setup_registers -- initialize register table
+ *
+ * modify register table based on actual host CPU
+ */
+static void pfm_amd64_setup_registers(void)
+{
+ u16 i;
+
+ pfm_arch_bv_set_bit(0, enable_mask);
+ pfm_arch_bv_set_bit(1, enable_mask);
+ pfm_arch_bv_set_bit(2, enable_mask);
+ pfm_arch_bv_set_bit(3, enable_mask);
+ max_enable = 3+1;
+
+ /*
+ * adjust reserved bit fields for family 16
+ */
+ if (current_cpu_data.x86 == 16) {
+ for (i = 0; i < PFM_AMD_NUM_PMCS; i++)
+ if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD)
+ pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD;
+ }
+}
+
+/**
+ * pfm_amd64_probe_pmu -- detect host PMU
+ */
+static int pfm_amd64_probe_pmu(void)
+{
+ if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return -1;
+
+ switch (current_cpu_data.x86) {
+ case 6:
+ case 15:
+ case 16:
+ PFM_INFO("found family=%d", current_cpu_data.x86);
+ break;
+ default:
+ PFM_INFO("unsupported family=%d", current_cpu_data.x86);
+ return -1;
+ }
+
+ /*
+ * check for local APIC (required)
+ */
+ if (!cpu_has_apic) {
+ PFM_INFO("no local APIC, unsupported");
+ return -1;
+ }
+
+ if (current_cpu_data.x86_max_cores > 1
+ && pfm_amd64_setup_nb_event_ctrl())
+ return -1;
+
+ pfm_amd64_setup_registers();
+
+ return 0;
+}
+
+/**
+ * pfm_amd64_has_ovfls -- detect if pending overflows
+ * @ctx: context to use
+ *
+ * detect if counters have overflowed.
+ * return:
+ * 0 : no overflow
+ * 1 : at least one overflow
+ */
+static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx)
+{
+ struct pfm_regmap_desc *xrd;
+ u64 *cnt_mask;
+ u64 wmask, val;
+ u16 i, num;
+
+ /*
+ * Check regular counters
+ */
+ cnt_mask = ctx->regs.cnt_pmds;
+ num = ctx->regs.num_counters;
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+ xrd = pfm_amd64_pmd_desc;
+
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+ rdmsrl(xrd[i].hw_addr, val);
+ if (!(val & wmask))
+ return 1;
+ num--;
+ }
+ }
+ return 0;
+}
+
+/**
+ * pfm_amd64_stop_save - stop monitoring, collect pending overflows
+ * @ctx: context to use
+ * @set: event set to stop
+ *
+ * interrupts are masked, PMU access guaranteed
+ */
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ u64 used_mask[PFM_PMC_BV];
+ u64 *cnt_pmds;
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ pmu_info = pfm_pmu_info();
+
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already have a pending overflow condition, we simply
+ * return to take care of it first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_pmds = ctx->regs.cnt_pmds;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) {
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ }
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_amd64_quiesce -- stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_amd64_quiesce(void)
+{
+ /*
+ * quiesce PMU by clearing available registers that have
+ * the start/stop capability
+ */
+ if (pfm_arch_bv_test_bit(0, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0, 0);
+ if (pfm_arch_bv_test_bit(1, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+1, 0);
+ if (pfm_arch_bv_test_bit(2, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+2, 0);
+ if (pfm_arch_bv_test_bit(3, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+3, 0);
+}
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf = {
+ .pmu_name = "AMD64",
+ .counter_width = 47,
+ .pmd_desc = pfm_amd64_pmd_desc,
+ .pmc_desc = pfm_amd64_pmc_desc,
+ .num_pmc_entries = PFM_AMD_NUM_PMCS,
+ .num_pmd_entries = PFM_AMD_NUM_PMDS,
+ .version = "1.2",
+ .pmu_info = &pfm_amd64_pmu_info
+};
+
+static int __init pfm_amd64_pmu_init_module(void)
+{
+ if (pfm_amd64_probe_pmu())
+ return -ENOSYS;
+ return pfm_pmu_register(&pfm_amd64_pmu_conf);
+}
+
+device_initcall(pfm_amd64_pmu_init_module);
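pfm_amd64_has_ovfls() and pfm_amd64_stop_save() above both rely on the same arithmetic: a counter is armed with its bit 47 set (counter_width in pfm_amd64_pmu_conf), it counts upward, and once the hardware value no longer has that bit set the counter has wrapped; the low 47 bits are then merged back into the 64-bit software value. The stand-alone sketch below only illustrates that arithmetic; the helper names and the 1000-event period are not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the width/overflow arithmetic used by the AMD64 module above. */
#define CTR_BITS	48			/* physical width (see PFM_AMD64_CTR_RSVD) */
#define COUNTER_WIDTH	47			/* pfm_amd64_pmu_conf.counter_width */
#define WMASK		(1ULL << COUNTER_WIDTH)	/* bit tested by has_ovfls() */
#define OVFL_MASK	(WMASK - 1)		/* low bits taken from hardware */
#define HW_MASK		((1ULL << CTR_BITS) - 1)

/* arm a counter for 'period' events: hardware sees the low 48 bits of -period */
static uint64_t arm_counter(uint64_t period)
{
	return (0ULL - period) & HW_MASK;	/* bit 47 set while period <= 2^47 */
}

/* overflow detected once the armed bit has cleared */
static int has_overflowed(uint64_t hw_val)
{
	return !(hw_val & WMASK);
}

/* 64-bit software value = saved upper bits | live low bits (as in stop_save) */
static uint64_t merge64(uint64_t soft_val, uint64_t hw_val)
{
	return (soft_val & ~OVFL_MASK) | (hw_val & OVFL_MASK);
}

int main(void)
{
	uint64_t hw = arm_counter(1000);

	printf("armed value  : 0x%llx\n", (unsigned long long)hw);
	hw = (hw + 1000) & HW_MASK;		/* simulate 1000 events: counter wraps */
	printf("after period : 0x%llx overflowed=%d\n",
	       (unsigned long long)hw, has_overflowed(hw));
	printf("merged 64-bit: 0x%llx\n",
	       (unsigned long long)merge64(0x0000800000000000ULL, hw));
	return 0;
}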
diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c
new file mode 100644
index 000000000000..ce4293dcfcda
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_intel_arch.c
@@ -0,0 +1,628 @@
+/*
+ * This file contains the Intel architectural perfmon v1, v2, v3
+ * description tables.
+ *
+ * Architectural perfmon was introduced with Intel Core Solo/Duo
+ * processors.
+ *
+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/perfmon_kern.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+static int pfm_intel_arch_version;
+
+DEFINE_PER_CPU(u64, saved_global_ctrl);
+
+/*
+ * - upper 32 bits are reserved
+ * - INT: APIC enable bit is reserved (forced to 1)
+ * - bit 21 is reserved
+ *
+ * RSVD: reserved bits are 1
+ */
+#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \
+ | (1ULL<<20) \
+ | (1ULL<<21))
+
+/*
+ * force Local APIC interrupt on overflow
+ * disable with NO_EMUL64
+ */
+#define PFM_IA_PMC_VAL (1ULL<<20)
+#define PFM_IA_NO64 (1ULL<<20)
+
+/*
+ * the architecture specifies that:
+ * IA32_PMCx MSR : starts at 0x0c1 & occupies a contiguous block of MSRs
+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupies a contiguous block of MSRs
+ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupies a contiguous block of MSRs
+ */
+#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0
+#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0
+#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0
+
+/*
+ * layout of EAX for CPUID.0xa leaf function
+ */
+struct pmu_eax {
+ unsigned int version:8; /* architectural perfmon version */
+ unsigned int num_cnt:8; /* number of generic counters */
+ unsigned int cnt_width:8; /* width of generic counters */
+ unsigned int ebx_length:8; /* number of architected events */
+};
+
+/*
+ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
+ */
+struct pmu_edx {
+ unsigned int num_cnt:5; /* number of fixed counters */
+ unsigned int cnt_width:8; /* width of fixed counters */
+ unsigned int reserved:19;
+};
+
+static void pfm_intel_arch_acquire_pmu_percpu(void);
+static void pfm_intel_arch_release_pmu_percpu(void);
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
+static void __kprobes pfm_intel_arch_quiesce(void);
+
+/*
+ * physical addresses of MSR controlling the perfevtsel and counter registers
+ */
+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
+ .stop_save = pfm_intel_arch_stop_save,
+ .has_ovfls = pfm_intel_arch_has_ovfls,
+ .quiesce = pfm_intel_arch_quiesce,
+ .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
+ .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
+};
+
+#define PFM_IA_C(n) { \
+ .type = PFM_REG_I64, \
+ .desc = "PERFEVTSEL"#n, \
+ .dfl_val = PFM_IA_PMC_VAL, \
+ .rsvd_msk = PFM_IA_PMC_RSVD, \
+ .no_emul64_msk = PFM_IA_NO64, \
+ .hw_addr = MSR_GEN_SEL_BASE+(n) \
+ }
+
+#define PFM_IA_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PMC"#n, \
+ .hw_addr = MSR_P6_PERFCTR0+n, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+#define PFM_IA_FD(n) \
+ { .type = PFM_REG_C, \
+ .desc = "FIXED_CTR"#n, \
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
+ .dep_pmcs[0] = 1ULL << 16 \
+ }
+
+
+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
+/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3),
+/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7),
+/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11),
+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
+
+/* pmc16 */ { .type = PFM_REG_I,
+ .desc = "FIXED_CTRL",
+ .dfl_val = 0x8888888888888888ULL, /* force PMI */
+ .rsvd_msk = 0, /* set dynamically */
+ .no_emul64_msk = 0,
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
+ },
+};
+#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc)
+
+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
+/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3),
+/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7),
+/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
+
+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
+/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15)
+};
+#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc)
+
+#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */
+#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */
+#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */
+
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
+
+static void pfm_intel_arch_check_errata(void)
+{
+ /*
+ * Core Duo errata AE49 (no fix). Both counters share a single
+ * enable bit in PERFEVTSEL0
+ */
+ if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14)
+ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
+}
+
+static inline void set_enable_mask(unsigned int i)
+{
+ pfm_arch_bv_set_bit(i, enable_mask);
+
+ /* max_enable = highest + 1 */
+ if ((i+1) > max_enable)
+ max_enable = i + 1;
+}
+
+static void pfm_intel_arch_setup_generic(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd;
+ unsigned int i;
+
+ /*
+ * first we handle the generic counters:
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded generic counters as unimplemented
+ */
+
+ /*
+ * min of number of Hw counters and hardcoded in the tables
+ */
+ if (count >= PFM_IA_MAX_CNT) {
+ printk(KERN_INFO "perfmon: Limiting number of generic counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_CNT, count);
+ count = PFM_IA_MAX_CNT;
+ }
+
+ /*
+ * adjust rsvd_msk for generic counters based on actual width
+ * initialize enable_mask (1 per pmd)
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++) {
+ pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
+ set_enable_mask(i);
+ }
+
+ /*
+ * handle the version 3 anythread bit (bit 21)
+ */
+ if (version == 3) {
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
+ }
+
+
+ /*
+ * mark unused generic counters as not available
+ */
+ for (i = count ; i < PFM_IA_MAX_CNT; i++) {
+ pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
+ pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
+ }
+}
+
+static void pfm_intel_arch_setup_fixed(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd, dfl;
+ unsigned int i;
+
+ /*
+ * handle the fixed counters (if any):
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded generic counters as unimplemented
+ */
+ if (count >= PFM_IA_MAX_FCNT) {
+ printk(KERN_INFO "perfmon: Limiting number of fixed counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_FCNT, count);
+ count = PFM_IA_MAX_FCNT;
+ }
+ /*
+ * adjust rsvd_msk for fixed counters based on actual width
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
+
+ /*
+ * handle the version 3 anythread bit (bit 2)
+ */
+ if (version == 3)
+ rsvd = 1ULL << 3;
+ else
+ rsvd = 3ULL << 2;
+
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = 0;
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2);
+
+ /*
+ * mark unused fixed counters as unimplemented
+ *
+ * update the rsvd_msk, dfl_val in FIXED_CTRL:
+ * - rsvd_msk: set all 4 bits
+ * - dfl_val : clear all 4 bits
+ */
+ dfl = pfm_intel_arch_pmc_desc[16].dfl_val;
+ rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk;
+
+ for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
+ rsvd |= 0xfULL << (i<<2);
+ dfl &= ~(0xfULL << (i<<2));
+ }
+
+ /*
+ * FIXED_CTR_CTRL unavailable when no fixed counters are defined
+ */
+ if (!count) {
+ pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA;
+ } else {
+ /* update rsvd_mask and dfl_val */
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd;
+ pfm_intel_arch_pmc_desc[16].dfl_val = dfl;
+ set_enable_mask(16);
+ }
+}
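For reference, FIXED_CTR_CTRL packs one 4-bit control field per fixed counter: bits 0-1 enable OS/user counting, bit 3 is the PMI bit forced by the 0x8888... default, and bit 2 (anythread) only becomes writable with v3. The loop above builds rsvd_msk and dfl_val nibble by nibble; the short stand-alone sketch below redoes the same computation with illustrative names only.

#include <stdio.h>
#include <stdint.h>

#define MAX_FIXED 16	/* PFM_IA_MAX_FCNT */

/* Illustrative re-implementation of the mask computation in setup_fixed(). */
static void fixed_ctrl_masks(unsigned int version, unsigned int count,
			     uint64_t *rsvd_msk, uint64_t *dfl_val)
{
	/* bit 3 (PMI) stays reserved; bit 2 (anythread) opens up with v3 */
	uint64_t per_ctr = (version == 3) ? 1ULL << 3 : 3ULL << 2;
	uint64_t rsvd = 0, dfl = 0x8888888888888888ULL;	/* force PMI */
	unsigned int i;

	for (i = 0; i < count; i++)
		rsvd |= per_ctr << (i << 2);		/* 4 bits per counter */

	for (i = count; i < MAX_FIXED; i++) {		/* unimplemented counters */
		rsvd |= 0xfULL << (i << 2);
		dfl &= ~(0xfULL << (i << 2));
	}
	*rsvd_msk = rsvd;
	*dfl_val = dfl;
}

int main(void)
{
	uint64_t rsvd, dfl;

	fixed_ctrl_masks(3, 3, &rsvd, &dfl);	/* e.g., v3 with 3 fixed counters */
	printf("FIXED_CTRL rsvd_msk=0x%016llx dfl_val=0x%016llx\n",
	       (unsigned long long)rsvd, (unsigned long long)dfl);
	return 0;
}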
+
+static int pfm_intel_arch_probe_pmu(void)
+{
+ union {
+ unsigned int val;
+ struct pmu_eax eax;
+ struct pmu_edx edx;
+ } eax, edx;
+ unsigned int ebx, ecx;
+ unsigned int width = 0;
+
+ edx.val = 0;
+
+ if (!cpu_has_arch_perfmon) {
+ PFM_INFO("no support for Intel architectural PMU");
+ return -1;
+ }
+
+ if (!cpu_has_apic) {
+ PFM_INFO("no Local APIC, try rebooting with lapic option");
+ return -1;
+ }
+
+ /* cpuid() call protected by cpu_has_arch_perfmon */
+ cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
+
+ /*
+ * some 6/15 models have buggy BIOS
+ */
+ if (eax.eax.version == 0
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
+ eax.eax.version = 2;
+ eax.eax.num_cnt = 2;
+ eax.eax.cnt_width = 40;
+ }
+
+ /*
+ * some v2 BIOSes are incomplete
+ */
+ if (eax.eax.version == 2 && !edx.edx.num_cnt) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ edx.edx.cnt_width = 40;
+ }
+
+ /*
+ * no fixed counters on earlier versions
+ */
+ if (eax.eax.version < 2) {
+ edx.val = 0;
+ } else {
+ /*
+ * use the min value of both widths until we support
+ * variable width counters
+ */
+ width = eax.eax.cnt_width < edx.edx.cnt_width ?
+ eax.eax.cnt_width : edx.edx.cnt_width;
+ }
+
+ /*
+ * Intel Atom processors have buggy firmware which does not report
+ * the correct number of fixed counters
+ */
+ if (eax.eax.version == 3 && edx.edx.num_cnt < 3
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
+ PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ }
+
+ PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
+ PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
+ eax.eax.num_cnt,
+ eax.eax.cnt_width,
+ edx.edx.num_cnt,
+ edx.edx.cnt_width);
+
+ pfm_intel_arch_setup_generic(eax.eax.version,
+ width,
+ eax.eax.num_cnt);
+
+ pfm_intel_arch_setup_fixed(eax.eax.version,
+ width,
+ edx.edx.num_cnt);
+
+ pfm_intel_arch_check_errata();
+
+ pfm_intel_arch_version = eax.eax.version;
+
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
+ * @ctx: context to work on
+ *
+ * detect if counters have overflowed.
+ * return:
+ * 0 : no overflow
+ * 1 : at least one overflow
+ */
+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
+{
+ u64 *cnt_mask;
+ u64 wmask, val;
+ u16 i, num;
+
+ cnt_mask = ctx->regs.cnt_pmds;
+ num = ctx->regs.num_counters;
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ /*
+ * we can leverage the fact that we know the mapping
+ * to hardcode the MSR address and avoid accessing
+ * more cachelines
+ *
+ * We need to check cnt_mask because not all registers
+ * may be available.
+ */
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+ rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val);
+ if (!(val & wmask))
+ return 1;
+ num--;
+ }
+ }
+ return 0;
+}
+
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u64 used_mask[PFM_PMC_BV];
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already have a pending overflow condition, we simply
+ * return to take care of it first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ *
+ * all pmds are counters
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_intel_arch_quiesce(void)
+{
+ u16 i;
+
+ /*
+ * PMC16 is the fixed control register so it has a
+ * distinct MSR address
+ *
+ * We do not use the hw_addr field in the table to avoid touching
+ * too many cachelines
+ */
+ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
+ if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) {
+ if (i == 16)
+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+ else
+ wrmsrl(MSR_P6_EVNTSEL0+i, 0);
+ }
+ }
+}
+/**
+* pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we simply make sure that all available counters are enabled.
+* After that, start/stop is controlled on a per-counter basis.
+*/
+static void pfm_intel_arch_acquire_pmu_percpu(void)
+{
+ struct pfm_regmap_desc *d;
+ u64 mask = 0;
+ unsigned int i;
+
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ /*
+ * build bitmask of registers that are available to
+ * us. In some cases, there may be fewer registers than
+ * what the PMU supports due to sharing with other kernel
+ * subsystems, such as NMI
+ */
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < 16; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << i;
+ }
+ for (i = 16; i < PFM_IA_MAX_PMDS; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << (32+i-16);
+ }
+ /*
+ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
+ */
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+
+ PFM_DBG("global=0x%llx set to 0x%llx",
+ __get_cpu_var(saved_global_ctrl),
+ mask);
+ /*
+ * enable all registers
+ *
+ * No need to quiesce PMU. If there is a overflow, it will be
+ * treated as spurious by the handler
+ */
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
+}
+
+/**
+* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we are done using the PMU, so we restore the power-on value.
+*/
+static void pfm_intel_arch_release_pmu_percpu(void)
+{
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ PFM_DBG("global_ctrl restored to 0x%llx\n",
+ __get_cpu_var(saved_global_ctrl));
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+}
+
+/*
+ * Counters may have model-specific width. Yet the documentation says
+ * that only the lower 32 bits can be written to due to the specification
+ * of wrmsr. Bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must
+ * not be set (see rsvd_msk for PMDs). As such the effective width of a
+ * counter is 31 bits only regardless of what CPUID.0xa returns.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
+ */
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
+ .pmu_name = "Intel architectural",
+ .pmd_desc = pfm_intel_arch_pmd_desc,
+ .counter_width = 31,
+ .num_pmc_entries = PFM_IA_MAX_PMCS,
+ .num_pmd_entries = PFM_IA_MAX_PMDS,
+ .pmc_desc = pfm_intel_arch_pmc_desc,
+ .version = "1.0",
+ .pmu_info = &pfm_intel_arch_pmu_info
+};
+
+static int __init pfm_intel_arch_pmu_init_module(void)
+{
+ if (pfm_intel_arch_probe_pmu())
+ return -ENOSYS;
+
+ return pfm_pmu_register(&pfm_intel_arch_pmu_conf);
+}
+
+device_initcall(pfm_intel_arch_pmu_init_module);
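pfm_intel_arch_probe_pmu() reads CPUID leaf 0xa and interprets EAX/EDX through the pmu_eax/pmu_edx layouts above. The same decoding can be checked from user space; the sketch below uses GCC's cpuid.h helper and reads only the fields those structures define, purely as an illustration of the probe logic.

#include <stdio.h>
#include <cpuid.h>

/* Illustrative only: decode CPUID.0xA the way pfm_intel_arch_probe_pmu() does. */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0xa not available");
		return 1;
	}

	printf("arch perfmon version : %u\n", eax & 0xff);
	printf("generic counters     : %u x %u bits\n",
	       (eax >> 8) & 0xff, (eax >> 16) & 0xff);
	printf("fixed counters       : %u x %u bits\n",
	       edx & 0x1f, (edx >> 5) & 0xff);
	return 0;
}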
diff --git a/include/linux/perfmon.h b/include/linux/perfmon.h
new file mode 100644
index 000000000000..6117e605a43b
--- /dev/null
+++ b/include/linux/perfmon.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __LINUX_PERFMON_H__
+#define __LINUX_PERFMON_H__
+
+/*
+ * This file contains all the user visible generic definitions for the
+ * interface. Model-specific user-visible definitions are located in
+ * the asm/perfmon.h file.
+ */
+
+/*
+ * include arch-specific user interface definitions
+ */
+#include <asm/perfmon.h>
+
+/*
+ * defined by each arch
+ */
+#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS
+#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS
+
+/*
+ * number of elements for each type of bitvector
+ * all bitvectors use u64 fixed size type on all architectures.
+ */
+#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3))
+#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS)
+#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS)
+
+/*
+ * argument to pfm_create
+ * populated on return
+ */
+struct pfarg_sinfo {
+ __u64 sif_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */
+ __u64 sif_avail_pmds[PFM_PMD_BV];/* out: available PMDs */
+ __u64 sif_reserved1[4]; /* for future use */
+};
+
+/*
+ * PMC and PMD generic register description
+ */
+struct pfarg_pmr {
+ __u16 reg_num; /* which register */
+ __u16 reg_res1; /* reserved */
+ __u32 reg_flags; /* REGFL flags */
+ __u64 reg_value; /* 64-bit value */
+};
+
+/*
+ * pfm_write, pfm_read type:
+ */
+#define PFM_RW_PMD 0x01 /* accessing PMD registers */
+#define PFM_RW_PMC 0x02 /* accessing PMC registers */
+
+/*
+ * pfm_set_state state:
+ */
+#define PFM_ST_START 0x01 /* start monitoring */
+#define PFM_ST_STOP 0x02 /* stop monitoring */
+
+/*
+ * pfm_attach special target to trigger detach
+ */
+#define PFM_NO_TARGET -1 /* detach session target */
+
+/*
+ * default value for the user and group security parameters in
+ * /proc/sys/kernel/perfmon/sys_group
+ * /proc/sys/kernel/perfmon/task_group
+ */
+#define PFM_GROUP_PERM_ANY -1 /* any user/group */
+
+/*
+ * perfmon version number
+ */
+#define PFM_VERSION_MAJ 3U
+#define PFM_VERSION_MIN 0U
+#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\
+ (PFM_VERSION_MIN & 0xffff))
+#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
+#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
+
+#endif /* __LINUX_PERFMON_H__ */
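These definitions, together with the sys_pfm_create/sys_pfm_write prototypes added to syscalls.h further down in this patch, are enough to sketch how a monitoring tool would drive the interface from user space. The snippet below is only an outline: the syscall numbers are placeholders, the event encoding is model-specific, the header is assumed to be exported to user space, and pfm_attach/pfm_set_state are inferred from the PFM_NO_TARGET and PFM_ST_* definitions above rather than shown in this hunk.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/perfmon.h>	/* assumes the header is exported to user space */

/* Placeholder syscall numbers -- the real values are architecture-specific. */
#define __NR_pfm_create	500
#define __NR_pfm_write	501

int main(void)
{
	struct pfarg_sinfo sif;
	struct pfarg_pmr pmc, pmd;
	long fd;

	/* create a session; sif returns the available PMC/PMD bitvectors */
	memset(&sif, 0, sizeof(sif));
	fd = syscall(__NR_pfm_create, 0, &sif, NULL, NULL, 0);
	if (fd < 0) {
		perror("pfm_create");
		return 1;
	}

	/* program PMC0 (model-specific event encoding) and arm PMD0 */
	memset(&pmc, 0, sizeof(pmc));
	pmc.reg_num = 0;
	pmc.reg_value = 0x4100c0;	/* illustrative encoding only */

	memset(&pmd, 0, sizeof(pmd));
	pmd.reg_num = 0;
	pmd.reg_value = 0;

	syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pmc, sizeof(pmc));
	syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pmd, sizeof(pmd));

	/*
	 * attaching to a thread and starting/stopping would use the
	 * pfm_attach/pfm_set_state calls (PFM_NO_TARGET, PFM_ST_START,
	 * PFM_ST_STOP) -- not shown in this hunk.
	 */
	close(fd);
	return 0;
}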
diff --git a/include/linux/perfmon_kern.h b/include/linux/perfmon_kern.h
new file mode 100644
index 000000000000..e21cd835bd2c
--- /dev/null
+++ b/include/linux/perfmon_kern.h
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __LINUX_PERFMON_KERN_H__
+#define __LINUX_PERFMON_KERN_H__
+/*
+ * This file contains all the definitions of data structures, variables, macros
+ * that are to be shared between generic code and arch-specific code
+ *
+ * For generic only definitions, use perfmon/perfmon_priv.h
+ */
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/perfmon.h>
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * system administrator configuration controls available via
+ * the /sys/kernel/perfmon interface
+ */
+struct pfm_controls {
+ u32 debug; /* debugging control bitmask */
+ gid_t task_group; /* gid to create a per-task context */
+ size_t arg_mem_max; /* maximum vector argument size */
+};
+extern struct pfm_controls pfm_controls;
+
+/*
+ * event_set: encapsulates the full PMU state
+ */
+struct pfm_event_set {
+ u16 nused_pmds; /* max number of used PMDs */
+ u16 nused_pmcs; /* max number of used PMCs */
+ u32 priv_flags; /* private flags (see below) */
+ u32 npend_ovfls; /* number of pending PMD overflow */
+ u32 pad1; /* padding */
+ u64 used_pmds[PFM_PMD_BV]; /* used PMDs */
+ u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */
+ u64 used_pmcs[PFM_PMC_BV]; /* used PMCs */
+ u64 pmcs[PFM_MAX_PMCS]; /* PMC values */
+ u64 pmds[PFM_MAX_PMDS]; /* PMD values */
+};
+
+/*
+ * common private event set flags (priv_flags)
+ *
+ * upper 16 bits: for arch-specific use
+ * lower 16 bits: for common use
+ */
+#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */
+#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */
+#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS \
+ | PFM_SETFL_PRIV_MOD_PMCS)
+
+
+/*
+ * context flags
+ */
+struct pfm_context_flags {
+ unsigned int started:1; /* pfm_start() issued */
+ unsigned int is_self:1; /* per-thread and self-monitoring */
+ unsigned int work_type:2; /* type of work for pfm_handle_work */
+ unsigned int reserved:28; /* for future use */
+};
+/*
+ * values for work_type (TIF_PERFMON_WORK must be set)
+ */
+#define PFM_WORK_NONE 0 /* nothing to do */
+#define PFM_WORK_ZOMBIE 1 /* cleanup zombie context */
+
+
+/*
+ * perfmon context state
+ */
+#define PFM_CTX_UNLOADED 1 /* context is detached */
+#define PFM_CTX_LOADED 2 /* context is attached */
+#define PFM_CTX_ZOMBIE 3 /* context lost owner but still attached */
+
+/*
+ * registers description
+ */
+struct pfm_regdesc {
+ u64 pmcs[PFM_PMC_BV]; /* available PMC */
+ u64 pmds[PFM_PMD_BV]; /* available PMD */
+ u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */
+ u64 intr_pmds[PFM_PMD_BV]; /* PMD generating intr */
+ u64 cnt_pmds[PFM_PMD_BV]; /* PMD counters */
+ u16 max_pmc; /* highest+1 avail PMC */
+ u16 max_pmd; /* highest+1 avail PMD */
+ u16 max_rw_pmd; /* highest+1 avail RW PMD */
+ u16 first_intr_pmd; /* first intr PMD */
+ u16 max_intr_pmd; /* highest+1 intr PMD */
+ u16 num_rw_pmd; /* number of avail RW PMD */
+ u16 num_pmcs; /* number of logical PMCS */
+ u16 num_pmds; /* number of logical PMDS */
+ u16 num_counters; /* number of counting PMD */
+};
+
+
+/*
+ * context: contains all the state of a session
+ */
+struct pfm_context {
+ spinlock_t lock; /* context protection */
+
+ struct pfm_context_flags flags;
+ u32 state; /* current state */
+ struct task_struct *task; /* attached task */
+
+ u64 last_act; /* last activation */
+ u32 last_cpu; /* last CPU used (SMP only) */
+
+ struct pfm_event_set *active_set; /* active set */
+ struct pfm_event_set _set0; /* event set 0 */
+
+ struct pfm_regdesc regs; /* registers available to context */
+};
+
+/*
+ * logging
+ */
+#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x)
+#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x)
+#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x)
+#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x)
+
+/*
+ * debugging
+ *
+ * Printk rate limiting is enforced to avoid getting flooded with too many
+ * error messages on the console (which could render the machine unresponsive).
+ * To get full debug output (turn off ratelimit):
+ * $ echo 0 >/proc/sys/kernel/printk_ratelimit
+ *
+ * debug is a bitmask where bits are defined as follows:
+ * bit 0: enable non-interrupt code debug messages
+ * bit 1: enable interrupt code debug messages
+ */
+#ifdef CONFIG_PERFMON_DEBUG
+#define _PFM_DBG(lm, f, x...) \
+ do { \
+ if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \
+ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \
+ __func__, __LINE__, \
+ smp_processor_id(), current->pid , ## x); \
+ } \
+ } while (0)
+
+#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x)
+#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ##x)
+#else
+#define PFM_DBG(f, x...) do {} while (0)
+#define PFM_DBG_ovfl(f, x...) do {} while (0)
+#endif
+
+extern struct pfm_pmu_config *pfm_pmu_conf;
+extern int perfmon_disabled;
+
+static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c)
+{
+ return (struct pfm_arch_context *)(c+1);
+}
+
+#include <linux/perfmon_pmu.h>
+
+extern const struct file_operations pfm_file_ops;
+
+void pfm_handle_work(struct pt_regs *regs);
+void __pfm_exit_thread(void);
+void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next);
+void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next);
+void __pfm_init_percpu(void *dummy);
+
+void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs);
+
+int pfm_session_allcpus_acquire(void);
+void pfm_session_allcpus_release(void);
+
+static inline void pfm_exit_thread(void)
+{
+ if (current->pfm_context)
+ __pfm_exit_thread();
+}
+
+/*
+ * include arch-specific kernel level definitions
+ */
+#include <asm/perfmon_kern.h>
+
+static inline void pfm_copy_thread(struct task_struct *task)
+{
+ /*
+ * context or perfmon TIF state is NEVER inherited
+ * by the child task. This holds for per-thread and system-wide.
+ */
+ task->pfm_context = NULL;
+ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
+}
+
+/*
+ * read a single PMD register.
+ */
+static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ return pfm_arch_read_pmd(ctx, cnum);
+}
+/*
+ * write a single PMD register.
+ */
+static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum,
+ u64 value)
+{
+ /*
+ * PMD writes are ignored for read-only registers
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO)
+ return;
+
+ /*
+ * clear unimplemented bits
+ */
+ value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ pfm_arch_write_pmd(ctx, cnum, value);
+}
+
+DECLARE_PER_CPU(struct pfm_context *, pmu_ctx);
+DECLARE_PER_CPU(struct task_struct *, pmu_owner);
+
+/*
+ * number of u64 to use for stack buffer in
+ * syscalls which take vector argument
+ */
+#ifndef PFM_ARCH_STK_ARG
+#define PFM_ARCH_STK_ARG 2
+#endif
+
+#define PFM_STK_ARG PFM_ARCH_STK_ARG
+
+#else /* !CONFIG_PERFMON */
+/*
+ * perfmon hooks are nops when CONFIG_PERFMON is undefined
+ */
+
+static inline void pfm_exit_thread(void)
+{}
+
+static inline void pfm_handle_work(struct pt_regs *regs)
+{}
+
+static inline void pfm_copy_thread(struct task_struct *t)
+{}
+
+static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n)
+{}
+
+static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n)
+{}
+
+static inline void pfm_session_allcpus_release(void)
+{}
+
+static inline int pfm_session_allcpus_acquire(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PERFMON */
+#endif /* __LINUX_PERFMON_KERN_H__ */
diff --git a/include/linux/perfmon_pmu.h b/include/linux/perfmon_pmu.h
new file mode 100644
index 000000000000..13d357140243
--- /dev/null
+++ b/include/linux/perfmon_pmu.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Interface for PMU description modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef __PERFMON_PMU_H__
+#define __PERFMON_PMU_H__ 1
+
+/*
+ * generic information about a PMC or PMD register
+ */
+struct pfm_regmap_desc {
+ u16 type; /* register infos */
+ u16 reserved1; /* for future use */
+ u32 reserved2; /* for future use */
+ u64 dfl_val; /* power-on default value (quiescent) */
+ u64 rsvd_msk; /* reserved bits: 1 means reserved */
+ u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */
+ unsigned long hw_addr; /* HW register address or index */
+ struct kobject kobj; /* for internal use only */
+ char *desc; /* HW register description string */
+ u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */
+};
+
+/*
+ * pfm_reg_desc helper macros
+ */
+#define PMC_D(t, d, v, r, n, h) \
+ { .type = t, \
+ .desc = d, \
+ .dfl_val = v, \
+ .rsvd_msk = r, \
+ .no_emul64_msk = n, \
+ .hw_addr = h \
+ }
+
+#define PMD_D(t, d, h) \
+ { .type = t, \
+ .desc = d, \
+ .rsvd_msk = 0, \
+ .no_emul64_msk = 0, \
+ .hw_addr = h \
+ }
+
+#define PMD_DR(t, d, h, r) \
+ { .type = t, \
+ .desc = d, \
+ .rsvd_msk = r, \
+ .no_emul64_msk = 0, \
+ .hw_addr = h \
+ }
+
+#define PMX_NA \
+ { .type = PFM_REG_NA }
+
+/*
+ * type of a PMU register (16-bit bitmask) for use with pfm_regmap_desc.type
+ */
+#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */
+#define PFM_REG_I 0x01 /* PMC/PMD: implemented */
+#define PFM_REG_WC 0x02 /* PMC: has write_checker */
+#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */
+#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */
+#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */
+#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */
+
+/*
+ * define some shortcuts for common types
+ */
+#define PFM_REG_W (PFM_REG_WC|PFM_REG_I)
+#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I)
+#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I)
+#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I)
+#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO)
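+
+/*
+ * For example, PFM_REG_C describes a classical counting PMD: implemented,
+ * using 64-bit virtualization, and able to generate an overflow interrupt.
+ */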
+
+typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req);
+
+typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req);
+
+/*
+ * structure used by PMU description modules
+ */
+struct pfm_pmu_config {
+ char *pmu_name; /* PMU family name */
+ char *version; /* config module version */
+
+ int counter_width; /* width of hardware counter */
+
+ struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */
+ struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */
+
+ pfm_pmc_check_t pmc_write_check;/* write checker (optional) */
+ pfm_pmd_check_t pmd_write_check;/* write checker (optional) */
+ pfm_pmd_check_t pmd_read_check; /* read checker (optional) */
+
+ u16 num_pmc_entries;/* #entries in pmc_desc */
+ u16 num_pmd_entries;/* #entries in pmd_desc */
+ void *pmu_info; /* model-specific infos */
+ /*
+ * fields computed internally, do not set in module
+ */
+ struct pfm_regdesc regs_all; /* regs available to all */
+ u64 ovfl_mask; /* overflow mask */
+};
+
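+/*
+ * A minimal, purely illustrative sketch of how a PMU description module
+ * might fill this structure; register names and hardware addresses below
+ * are made up:
+ *
+ *	static struct pfm_regmap_desc foo_pmc_desc[] = {
+ *		PMC_D(PFM_REG_I, "FOO_EVTSEL0", 0, 0, 0, 0x100),
+ *	};
+ *	static struct pfm_regmap_desc foo_pmd_desc[] = {
+ *		PMD_D(PFM_REG_C, "FOO_CTR0", 0x200),
+ *	};
+ *
+ *	static struct pfm_pmu_config foo_pmu_conf = {
+ *		.pmu_name        = "Foo",
+ *		.version         = "1.0",
+ *		.counter_width   = 48,
+ *		.pmc_desc        = foo_pmc_desc,
+ *		.num_pmc_entries = 1,
+ *		.pmd_desc        = foo_pmd_desc,
+ *		.num_pmd_entries = 1,
+ *	};
+ *
+ * The module init code would then call pfm_pmu_register(&foo_pmu_conf).
+ */
+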
+static inline void *pfm_pmu_info(void)
+{
+ return pfm_pmu_conf->pmu_info;
+}
+
+int pfm_pmu_register(struct pfm_pmu_config *cfg);
+
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
+
+#endif /* __PERFMON_PMU_H__ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 755b5705cd38..8e23536e66be 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1306,6 +1306,10 @@ struct task_struct {
unsigned long default_timer_slack_ns;
struct list_head *scm_work_list;
+
+#if defined(CONFIG_PERFMON_V20) || defined(CONFIG_PERFMON)
+ struct pfm_context *pfm_context;
+#endif
};
/*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d6ff145919ca..d12a175e0f43 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -625,4 +625,15 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+#ifdef CONFIG_PERFMON_V20
+struct pfarg_sinfo;
+asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo *s,
+ char __user *f, void __user *uarg, size_t uarg_size);
+
+asmlinkage long sys_pfm_write(int fd, int flags, int type, void __user *arg, size_t s);
+asmlinkage long sys_pfm_read(int fd, int flags, int type, void __user *arg, size_t s);
+asmlinkage long sys_pfm_attach(int fd, int flags, int target);
+asmlinkage long sys_pfm_set_state(int fd, int flags, int state);
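+
+/*
+ * Illustrative call sequence from user space (arguments abbreviated):
+ *
+ *	fd = pfm_create(0, &sinfo, ...);  create a session, learn available regs
+ *	pfm_write(fd, ...);               program PMC/PMD registers
+ *	pfm_attach(fd, 0, target);        attach to a thread (or to self)
+ *	pfm_set_state(fd, ...);           start monitoring
+ *	pfm_read(fd, ...);                read the counters back
+ */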
+#endif /* CONFIG_PERFMON_V20 */
+
#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..1432b300e1ca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,10 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+
+/* perfmon */
+cond_syscall(sys_pfm_create);
+cond_syscall(sys_pfm_write);
+cond_syscall(sys_pfm_read);
+cond_syscall(sys_pfm_attach);
+cond_syscall(sys_pfm_set_state);
diff --git a/perfmon/Makefile b/perfmon/Makefile
new file mode 100644
index 000000000000..4ee61aa50675
--- /dev/null
+++ b/perfmon/Makefile
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@gmail.com>
+#
+obj-y = perfmon_ctx.o perfmon_file.o \
+ perfmon_attach.o perfmon_res.o \
+ perfmon_init.o perfmon_activate.o \
+ perfmon_intr.o perfmon_rw.o \
+ perfmon_ctxsw.o perfmon_pmu.o \
+ perfmon_syscalls.o perfmon_sysfs.o
diff --git a/perfmon/perfmon_activate.c b/perfmon/perfmon_activate.c
new file mode 100644
index 000000000000..9398e7c15215
--- /dev/null
+++ b/perfmon/perfmon_activate.c
@@ -0,0 +1,136 @@
+/*
+ * perfmon_activate.c: perfmon2 start/stop functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * __pfm_start - activate monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and not self-monitoring, the monitored
+ * thread must be stopped. Activation will be effective next time the thread
+ * is context switched in.
+ *
+ * Monitoring resumes on the last active event set; set0 is used for the
+ * first activation.
+ *
+ * On some architectures, e.g., IA-64, it may be possible to start monitoring
+ * without calling this function under certain conditions (per-thread and self
+ * monitoring). In this case, either set0 or the last active set is used.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_start(struct pfm_context *ctx)
+{
+ struct task_struct *task;
+ struct pfm_event_set *set;
+
+ task = ctx->task;
+
+ /*
+ * UNLOADED: error
+ * LOADED : normal start, nop if started
+ * ZOMBIE : cannot happen
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ set = ctx->active_set;
+
+ /*
+ * mark as started
+ * must be done before calling pfm_arch_start()
+ */
+ ctx->flags.started = 1;
+
+ pfm_arch_start(task, ctx);
+
+ /*
+ * we check whether we had a pending ovfl before restarting.
+	 * If so we need to regenerate the interrupt to make sure we
+	 * do not lose any overflow information. For non-self monitoring
+	 * this check is done in the __pfm_ctxswin_thread() routine.
+	 */
+ if (task == current && set->npend_ovfls)
+ pfm_arch_resend_irq(ctx);
+
+ return 0;
+}
+
+/**
+ * __pfm_stop - stop monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and when not self-monitoring,
+ * the monitored thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_stop(struct pfm_context *ctx)
+{
+ struct task_struct *task;
+
+ /*
+ * context must be attached (zombie cannot happen)
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ task = ctx->task;
+
+ PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d",
+ task ? task->pid : -1,
+ ctx->state,
+ !task);
+
+ pfm_arch_stop(task, ctx);
+
+ ctx->flags.started = 0;
+ /*
+ * starting now, in-flight PMU interrupt for this context
+ * are treated as spurious
+ */
+ return 0;
+}
diff --git a/perfmon/perfmon_attach.c b/perfmon/perfmon_attach.c
new file mode 100644
index 000000000000..4ef00982f218
--- /dev/null
+++ b/perfmon/perfmon_attach.c
@@ -0,0 +1,337 @@
+/*
+ * perfmon_attach.c: perfmon2 load/unload functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * pfm_load_ctx_thread - attach context to a thread
+ * @ctx: context to operate on
+ * @task: thread to attach to
+ *
+ * The function must be called with the context locked and interrupts disabled.
+ */
+static int pfm_load_ctx_thread(struct pfm_context *ctx,
+ struct task_struct *task)
+{
+ struct pfm_event_set *set;
+ struct pfm_context *old;
+ int ret;
+ u16 max;
+
+ PFM_DBG("pid=%d", task->pid);
+
+ /*
+ * we must use cmpxchg to avoid race condition with another
+ * context trying to attach to the same task.
+ *
+ * per-thread:
+ * - task to attach to is checked in sys_pfm_attach() to avoid
+ * locking issues. if found, and not self, task refcount was
+ * incremented.
+ */
+ old = cmpxchg(&task->pfm_context, NULL, ctx);
+ if (old) {
+ PFM_DBG("load_pid=%d has a context "
+ "old=%p new=%p cur=%p",
+ task->pid,
+ old,
+ ctx,
+ task->pfm_context);
+ return -EEXIST;
+ }
+
+ /*
+ * initialize sets
+ */
+ set = ctx->active_set;
+
+ /*
+ * cleanup bitvectors
+ */
+ max = ctx->regs.max_intr_pmd;
+ pfm_arch_bv_zero(set->povfl_pmds, max);
+
+ set->npend_ovfls = 0;
+
+ /*
+ * we cannot just use plain clear because of arch-specific flags
+ */
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ /*
+ * link context to task
+ */
+ ctx->task = task;
+
+ /*
+ * perform any architecture specific actions
+ */
+ ret = pfm_arch_load_context(ctx);
+ if (ret)
+ goto error_noload;
+
+ /*
+ * now reserve the session, before we can proceed with
+ * actually accessing the PMU hardware
+ */
+ ret = pfm_session_acquire();
+ if (ret)
+ goto error;
+
+ if (ctx->task != current) {
+
+ /* not self-monitoring */
+ ctx->flags.is_self = 0;
+
+ /* force a full reload */
+ ctx->last_act = PFM_INVALID_ACTIVATION;
+ ctx->last_cpu = -1;
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+
+ } else {
+ /*
+ * on UP, we may have to push out the PMU
+ * state of the last monitored thread
+ */
+ pfm_check_save_prev_ctx();
+
+ ctx->last_cpu = smp_processor_id();
+ __get_cpu_var(pmu_activation_number)++;
+ ctx->last_act = __get_cpu_var(pmu_activation_number);
+
+ ctx->flags.is_self = 1;
+
+ /*
+ * load PMD from set
+ * load PMC from set
+ */
+ pfm_arch_restore_pmds(ctx, set);
+ pfm_arch_restore_pmcs(ctx, set);
+
+ /*
+ * set new ownership
+ */
+ pfm_set_pmu_owner(ctx->task, ctx);
+ }
+
+ /*
+ * will cause switch_to() to invoke PMU
+ * context switch code
+ */
+ set_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
+
+ ctx->state = PFM_CTX_LOADED;
+
+ return 0;
+
+error:
+ pfm_arch_unload_context(ctx);
+ ctx->task = NULL;
+error_noload:
+ /*
+ * detach context
+ */
+ task->pfm_context = NULL;
+ return ret;
+}
+
+/**
+ * __pfm_load_context - attach context to a thread
+ * @ctx: context to operate on
+ * @task: thread to attach to
+ */
+int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task)
+{
+ return pfm_load_ctx_thread(ctx, task);
+}
+
+/**
+ * pfm_update_ovfl_pmds - account for pending ovfls on PMDs
+ * @ctx: context to operate on
+ *
+ * This function is always called after pfm_stop has been issued
+ */
+static void pfm_update_ovfl_pmds(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 *cnt_pmds;
+ u64 ovfl_mask;
+ u16 num_ovfls, i;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_pmds = ctx->regs.cnt_pmds;
+ set = ctx->active_set;
+
+ if (!set->npend_ovfls)
+ return;
+
+ num_ovfls = set->npend_ovfls;
+ PFM_DBG("novfls=%u", num_ovfls);
+
+ for (i = 0; num_ovfls; i++) {
+ if (pfm_arch_bv_test_bit(i, set->povfl_pmds)) {
+ /* only correct value for counters */
+ if (pfm_arch_bv_test_bit(i, cnt_pmds))
+ set->pmds[i] += 1 + ovfl_mask;
+ num_ovfls--;
+ }
+ PFM_DBG("pmd%u val=0x%llx",
+ i,
+ (unsigned long long)set->pmds[i]);
+ }
+ /*
+	 * we need to clear them to prevent stale overflow information
+	 * from being reported even after the context is unloaded
+ */
+ set->npend_ovfls = 0;
+ pfm_arch_bv_zero(set->povfl_pmds, ctx->regs.max_intr_pmd);
+}
+
+/**
+ * __pfm_unload_context - detach context from CPU or thread
+ * @ctx: context to operate on
+ *
+ * The function must be called with the context locked and interrupts disabled.
+ */
+int __pfm_unload_context(struct pfm_context *ctx)
+{
+ int ret;
+
+ PFM_DBG("ctx_state=%d task [%d]",
+ ctx->state,
+ ctx->task ? ctx->task->pid : -1);
+
+ /*
+ * check unload-able state
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ /*
+ * stop monitoring
+ */
+ ret = __pfm_stop(ctx);
+ if (ret)
+ return ret;
+
+ ctx->state = PFM_CTX_UNLOADED;
+
+ /*
+ * save active set
+ * UP:
+	 *   if not the current task, then due to lazy save the
+	 *   state may still be live
+ * for system-wide, guaranteed to run on correct CPU
+ */
+ if (__get_cpu_var(pmu_ctx) == ctx) {
+ /*
+ * pending overflows have been saved by pfm_stop()
+ */
+ pfm_save_pmds(ctx);
+ pfm_set_pmu_owner(NULL, NULL);
+ PFM_DBG("released ownership");
+ }
+
+ /*
+ * account for pending overflows
+ */
+ pfm_update_ovfl_pmds(ctx);
+
+ /*
+ * arch-specific unload operations
+ */
+ pfm_arch_unload_context(ctx);
+
+ /*
+ * per-thread: disconnect from monitored task
+ */
+ if (ctx->task) {
+ ctx->task->pfm_context = NULL;
+ clear_tsk_thread_flag(ctx->task, TIF_PERFMON_CTXSW);
+ ctx->task = NULL;
+ }
+ return 0;
+}
+
+/**
+ * __pfm_exit_thread - detach and free context on thread exit
+ */
+void __pfm_exit_thread(void)
+{
+ struct pfm_context *ctx;
+ unsigned long flags;
+ int free_ok = 0, ret = -1;
+
+ ctx = current->pfm_context;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self);
+
+ /*
+ * __pfm_unload_context() cannot fail
+ * in the context states we are interested in
+ */
+ switch (ctx->state) {
+ case PFM_CTX_LOADED:
+ ret = __pfm_unload_context(ctx);
+ break;
+ case PFM_CTX_ZOMBIE:
+ ret = __pfm_unload_context(ctx);
+ free_ok = 1;
+ break;
+ default:
+ BUG_ON(ctx->state != PFM_CTX_LOADED);
+ break;
+ }
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (!ret)
+ pfm_session_release();
+
+ /*
+ * All memory free operations (especially for vmalloc'ed memory)
+ * MUST be done with interrupts ENABLED.
+ */
+ if (free_ok)
+ pfm_free_context(ctx);
+}
diff --git a/perfmon/perfmon_ctx.c b/perfmon/perfmon_ctx.c
new file mode 100644
index 000000000000..985977069a40
--- /dev/null
+++ b/perfmon/perfmon_ctx.c
@@ -0,0 +1,400 @@
+/*
+ * perfmon_ctx.c: perfmon2 context functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * context memory pool pointer
+ */
+static struct kmem_cache *pfm_ctx_cachep;
+
+/*
+ * This function is called when we need to perform asynchronous
+ * work on a context. It is called ONLY when about to
+ * return to user mode (very much like with signal handling).
+ *
+ * we come here if:
+ *
+ * - we are zombie and we need to cleanup our state
+ *
+ * pfm_handle_work() can be called with interrupts enabled
+ * (TIF_NEED_RESCHED) or disabled.
+ */
+void pfm_handle_work(struct pt_regs *regs)
+{
+ struct pfm_context *ctx;
+ unsigned long flags;
+ int type;
+
+ if (!user_mode(regs))
+ return;
+
+ clear_thread_flag(TIF_PERFMON_WORK);
+
+ ctx = current->pfm_context;
+ if (ctx == NULL) {
+ PFM_DBG("[%d] has no ctx", current->pid);
+ return;
+ }
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ type = ctx->flags.work_type;
+ ctx->flags.work_type = PFM_WORK_NONE;
+
+ PFM_DBG("work_type=%d", type);
+
+ switch (type) {
+ case PFM_WORK_ZOMBIE:
+ goto do_zombie;
+ default:
+ PFM_DBG("unkown type=%d", type);
+ goto nothing_todo;
+ }
+nothing_todo:
+ /*
+ * restore flags as they were upon entry
+ */
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ return;
+
+do_zombie:
+ PFM_DBG("context is zombie, bailing out");
+
+ /* always returns 0 in this case */
+ __pfm_unload_context(ctx);
+
+ /*
+ * keep the spinlock check happy
+ */
+ spin_unlock(&ctx->lock);
+
+ /*
+ * enable interrupt for vfree()
+ */
+ local_irq_enable();
+
+ /*
+ * actual context free
+ */
+ pfm_free_context(ctx);
+
+ /*
+ * restore interrupts as they were upon entry
+ */
+ local_irq_restore(flags);
+
+ /*
+ * pfm_unload always successful, so can release
+ * session safely
+ */
+ pfm_session_release();
+}
+
+/**
+ * pfm_free_context - de-allocate context and associated resources
+ * @ctx: context to free
+ */
+void pfm_free_context(struct pfm_context *ctx)
+{
+ pfm_arch_context_free(ctx);
+
+ PFM_DBG("free ctx @0x%p", ctx);
+ kmem_cache_free(pfm_ctx_cachep, ctx);
+ /*
+ * decrease refcount on:
+ * - PMU description table
+ */
+ pfm_pmu_release();
+}
+
+/**
+ * pfm_init_ctx -- initialize context SLAB
+ *
+ * called from pfm_init
+ */
+int __init pfm_init_ctx(void)
+{
+ pfm_ctx_cachep = kmem_cache_create("pfm_context",
+ sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
+			0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!pfm_ctx_cachep) {
+ PFM_ERR("cannot initialize context slab");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/**
+ * pfm_ctx_permissions - check authorization to create new context
+ * @ctx_flags: context flags passed by user
+ *
+ * check for permissions to create a context.
+ *
+ * A sysadmin may decide to restrict creation of per-thread
+ * context to a group of users using the group id via
+ * /sys/kernel/perfmon/task_group
+ *
+ * Once we identify a user level package which can be used
+ * to grant/revoke Linux capabilities at login via PAM, we will
+ * be able to use capabilities. We would also need to increase
+ * the size of cap_t to support more than 32 capabilities (it
+ * is currently defined as u32 and 32 capabilities are already
+ * defined).
+ */
+static inline int pfm_ctx_permissions(u32 ctx_flags)
+{
+ if (pfm_controls.task_group != PFM_GROUP_PERM_ANY
+ && !in_group_p(pfm_controls.task_group)) {
+ PFM_DBG("user group not allowed to create a task context");
+ return -EPERM;
+ }
+ return 0;
+}
+
+/**
+ * pfm_create_initial_set - create the initial set from __pfm_create_context
+ * @ctx: context to attach the set to
+ */
+static void pfm_create_initial_set(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 *impl_pmcs;
+ u16 i, max_pmc;
+
+ set = ctx->active_set;
+ max_pmc = ctx->regs.max_pmc;
+ impl_pmcs = ctx->regs.pmcs;
+
+ /*
+ * install default values for all PMC registers
+ */
+ for (i = 0; i < max_pmc; i++) {
+ if (pfm_arch_bv_test_bit(i, impl_pmcs)) {
+ set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val;
+ PFM_DBG("pmc%u=0x%llx",
+ i,
+ (unsigned long long)set->pmcs[i]);
+ }
+ }
+ /*
+ * PMD registers are set to 0 when the event set is allocated,
+ * hence we do not need to explicitly initialize them.
+ *
+ * For virtual PMD registers (i.e., those tied to a SW resource)
+ * their value becomes meaningful once the context is attached.
+ */
+}
+
+/**
+ * __pfm_create_context - allocate and initialize a perfmon context
+ * @ctx_flags : user context flags
+ * @sif: pointer to pfarg_sinfo to be updated
+ * @new_ctx: will contain new context address on return
+ *
+ * function used to allocate a new context. A context is allocated along
+ * with the default event set. If a sampling format is used, the buffer
+ * may be allocated and initialized.
+ *
+ * The file descriptor identifying the context is allocated and returned
+ * to caller.
+ *
+ * This function operates with no locks and interrupts are enabled.
+ * return:
+ * >=0: the file descriptor to identify the context
+ * <0 : the error code
+ */
+int __pfm_create_context(__u32 ctx_flags,
+ struct pfarg_sinfo *sif,
+ struct pfm_context **new_ctx)
+{
+ struct pfm_context *ctx;
+ struct file *filp = NULL;
+ int fd = 0, ret = -EINVAL;
+
+ if (!pfm_pmu_conf)
+ return -ENOSYS;
+
+ /* no context flags supported yet */
+ if (ctx_flags)
+ goto error_alloc;
+
+ ret = pfm_ctx_permissions(ctx_flags);
+ if (ret < 0)
+ goto error_alloc;
+
+ /*
+ * we can use GFP_KERNEL and potentially sleep because we do
+ * not hold any lock at this point.
+ */
+ might_sleep();
+ ret = -ENOMEM;
+ ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto error_alloc;
+
+ PFM_DBG("alloc ctx @0x%p", ctx);
+
+ ctx->active_set = &ctx->_set0;
+
+ spin_lock_init(&ctx->lock);
+
+ /*
+ * context is unloaded
+ */
+ ctx->state = PFM_CTX_UNLOADED;
+
+ ret = pfm_pmu_acquire(ctx);
+ if (ret)
+ goto error_file;
+ /*
+ * check if PMU is usable
+ */
+	if (!(ctx->regs.num_pmcs && ctx->regs.num_pmds)) {
+ PFM_DBG("no usable PMU registers");
+ ret = -EBUSY;
+ goto error_file;
+ }
+
+ ret = -ENFILE;
+ fd = pfm_alloc_fd(&filp);
+ if (fd < 0)
+ goto error_file;
+
+ /*
+ * initialize arch-specific section
+ * must be done before fmt_init()
+ */
+ ret = pfm_arch_context_create(ctx, ctx_flags);
+ if (ret)
+ goto error_set;
+
+ ret = -ENOMEM;
+
+ /*
+ * add initial set
+ */
+ pfm_create_initial_set(ctx);
+
+ filp->private_data = ctx;
+
+ ctx->last_act = PFM_INVALID_ACTIVATION;
+ ctx->last_cpu = -1;
+
+ PFM_DBG("flags=0x%x fd=%d", ctx_flags, fd);
+
+ if (new_ctx)
+ *new_ctx = ctx;
+
+ /*
+ * copy bitmask of available PMU registers
+ *
+ * must copy over the entire vector to avoid
+	 * returning bogus upper bits passed in by the user
+ */
+ pfm_arch_bv_copy(sif->sif_avail_pmcs,
+ ctx->regs.pmcs,
+ PFM_MAX_PMCS);
+
+ pfm_arch_bv_copy(sif->sif_avail_pmds,
+ ctx->regs.pmds,
+ PFM_MAX_PMDS);
+
+ /*
+ * we defer the fd_install until we are certain the call succeeded
+ * to ensure we do not have to undo its effect. Neither put_filp()
+ * nor put_unused_fd() undoes the effect of fd_install().
+ */
+ fd_install(fd, filp);
+
+ return fd;
+
+error_set:
+ put_filp(filp);
+ put_unused_fd(fd);
+error_file:
+ /*
+ * calls the right *_put() functions
+ * calls pfm_release_pmu()
+ */
+ pfm_free_context(ctx);
+ return ret;
+error_alloc:
+ return ret;
+}
+
+/**
+ * pfm_undo_create -- undo context creation
+ * @fd: file descriptor to close
+ * @ctx: newly created context
+ *
+ * upon return neither fd nor ctx is usable
+ */
+void pfm_undo_create(int fd, struct pfm_context *ctx)
+{
+ struct files_struct *files = current->files;
+ struct file *file;
+ int fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ /*
+ * there is no fd_uninstall(), so we do it
+ * here. put_unused_fd() does not remove the
+ * effect of fd_install().
+ */
+
+ spin_lock(&files->file_lock);
+ files->fd_array[fd] = NULL;
+ spin_unlock(&files->file_lock);
+
+ fput_light(file, fput_needed);
+
+ /*
+ * decrement ref count and kill file
+ */
+ put_filp(file);
+
+ put_unused_fd(fd);
+
+ pfm_free_context(ctx);
+}
diff --git a/perfmon/perfmon_ctxsw.c b/perfmon/perfmon_ctxsw.c
new file mode 100644
index 000000000000..b1086f6dca31
--- /dev/null
+++ b/perfmon/perfmon_ctxsw.c
@@ -0,0 +1,252 @@
+/*
+ * perfmon_ctxsw.c: perfmon2 context switch code
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
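+/**
+ * pfm_save_pmds - save current hardware PMD values into the active set
+ * @ctx: context to operate on
+ *
+ * Called with the context locked and interrupts masked.
+ */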
+void pfm_save_pmds(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 val, ovfl_mask;
+ u64 *used_pmds, *cnt_pmds;
+ u16 i, num;
+
+ set = ctx->active_set;
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ num = set->nused_pmds;
+ cnt_pmds = ctx->regs.cnt_pmds;
+ used_pmds = set->used_pmds;
+
+ /*
+ * save HW PMD, for counters, reconstruct 64-bit value
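+	 * (e.g., with a 48-bit wide counter, ovfl_mask = 2^48 - 1: the low
+	 *  48 bits come from the hardware read while the upper bits come
+	 *  from the 64-bit software copy in set->pmds[])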
+ */
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, used_pmds)) {
+ val = pfm_read_pmd(ctx, i);
+ if (likely(pfm_arch_bv_test_bit(i, cnt_pmds)))
+ val = (set->pmds[i] & ~ovfl_mask) |
+ (val & ovfl_mask);
+ set->pmds[i] = val;
+ num--;
+ }
+ }
+}
+
+/*
+ * interrupts are disabled (no preemption)
+ */
+void __pfm_ctxswin_thread(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ u64 cur_act;
+ struct pfm_event_set *set;
+ int reload_pmcs, reload_pmds;
+ int mycpu, is_active;
+
+ mycpu = smp_processor_id();
+
+ cur_act = __get_cpu_var(pmu_activation_number);
+ /*
+ * we need to lock context because it could be accessed
+	 * from another CPU. Normally the schedule() code
+	 * has masked interrupts which should be enough to
+ * protect against PMU interrupts.
+ */
+ spin_lock(&ctx->lock);
+
+ is_active = pfm_arch_is_active(ctx);
+
+ set = ctx->active_set;
+
+ /*
+	 * in case of zombie, we do not complete the ctxswin of the
+	 * PMU, and we force a call to pfm_handle_work() to finish
+	 * cleanup, i.e., free the context. The reason for
+ * deferring to pfm_handle_work() is that it is not possible
+ * to vfree() with interrupts disabled.
+ */
+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
+ pfm_post_work(task, ctx, PFM_WORK_ZOMBIE);
+ goto done;
+ }
+
+ /*
+ * if we were the last user of the PMU on that CPU,
+ * then nothing to do except restore psr
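+	 *
+	 * (last_cpu/last_act record where and under which per-CPU activation
+	 *  number this context last owned the PMU; the activation number is
+	 *  bumped every time a context takes ownership, so a match on both
+	 *  means no other context has touched the PMU state in the meantime)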
+ */
+ if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) {
+ /*
+ * check for forced reload conditions
+ */
+ reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
+ reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
+ } else {
+#ifndef CONFIG_SMP
+ pfm_check_save_prev_ctx();
+#endif
+ reload_pmcs = 1;
+ reload_pmds = 1;
+ }
+ /* consumed */
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ if (reload_pmds)
+ pfm_arch_restore_pmds(ctx, set);
+
+ /*
+	 * need to check if we had an in-flight interrupt in
+	 * pfm_ctxswout_thread(). If at least one bit is set, then we must replay
+ * the interrupt to avoid losing some important performance data.
+ *
+ * npend_ovfls is cleared in interrupt handler
+ */
+ if (set->npend_ovfls)
+ pfm_arch_resend_irq(ctx);
+
+ if (reload_pmcs)
+ pfm_arch_restore_pmcs(ctx, set);
+
+ /*
+ * record current activation for this context
+ */
+ __get_cpu_var(pmu_activation_number)++;
+ ctx->last_cpu = mycpu;
+ ctx->last_act = __get_cpu_var(pmu_activation_number);
+
+ /*
+ * establish new ownership.
+ */
+ pfm_set_pmu_owner(task, ctx);
+
+ pfm_arch_ctxswin_thread(task, ctx);
+done:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * interrupts are masked, runqueue lock is held.
+ *
+ * In UP. we simply stop monitoring and leave the state
+ * in place, i.e., lazy save
+ */
+void __pfm_ctxswout_thread(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ int need_save_pmds, is_active;
+
+ /*
+ * we need to lock context because it could be accessed
+	 * from another CPU. Normally the schedule() code
+	 * has masked interrupts which should be enough to
+ * protect against PMU interrupts.
+ */
+
+ spin_lock(&ctx->lock);
+
+ is_active = pfm_arch_is_active(ctx);
+
+ /*
+ * stop monitoring and
+ * collect pending overflow information
+ * needed on ctxswin. We cannot afford to lose
+ * a PMU interrupt.
+ */
+ need_save_pmds = pfm_arch_ctxswout_thread(task, ctx);
+
+#ifdef CONFIG_SMP
+ /*
+ * in SMP, release ownership of this PMU.
+ * PMU interrupts are masked, so nothing
+ * can happen.
+ */
+ pfm_set_pmu_owner(NULL, NULL);
+
+ /*
+ * On some architectures, it is necessary to read the
+ * PMD registers to check for pending overflow in
+ * pfm_arch_ctxswout_thread(). In that case, saving of
+ * the PMDs may be done there and not here.
+ */
+ if (need_save_pmds)
+ pfm_save_pmds(ctx);
+#endif
+ spin_unlock(&ctx->lock);
+}
+
+/**
+ * pfm_ctxsw_out - save PMU state on context switch out
+ * @prev: thread being switched out
+ * @next: thread being switched in
+ *
+ * We also pass the next thread because on some platforms it may be
+ * necessary to transfer some settings from the current thread to the next
+ *
+ * Interrupts are masked
+ */
+void pfm_ctxsw_out(struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct pfm_context *ctxp;
+
+ ctxp = prev->pfm_context;
+
+ if (ctxp)
+ __pfm_ctxswout_thread(prev, ctxp);
+}
+
+/**
+ * pfm_ctxsw_in - restore PMU state on context switch in
+ * @prev: thread being switched out
+ * @next: thread being switched in
+ *
+ * We also pass the prev thread because on some platforms it may be
+ * necessary to transfer some settings from the previous thread to the next
+ *
+ * Interrupts are masked
+ */
+void pfm_ctxsw_in(struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct pfm_context *ctxn;
+
+ ctxn = next->pfm_context;
+
+ if (ctxn)
+ __pfm_ctxswin_thread(next, ctxn);
+}
diff --git a/perfmon/perfmon_file.c b/perfmon/perfmon_file.c
new file mode 100644
index 000000000000..12ec6b7bea73
--- /dev/null
+++ b/perfmon/perfmon_file.c
@@ -0,0 +1,306 @@
+/*
+ * perfmon_file.c: perfmon2 file input/output functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */
+
+struct pfm_controls pfm_controls = {
+ .task_group = PFM_GROUP_PERM_ANY,
+ .arg_mem_max = PAGE_SIZE,
+};
+
+static int __init enable_debug(char *str)
+{
+ pfm_controls.debug = 1;
+ PFM_INFO("debug output enabled\n");
+ return 1;
+}
+__setup("perfmon_debug", enable_debug);
+
+static int pfmfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
+}
+
+static struct file_system_type pfm_fs_type = {
+ .name = "pfmfs",
+ .get_sb = pfmfs_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * pfmfs should _never_ be mounted by userland - too much of security hassle,
+ * no real gain from having the whole whorehouse mounted. So we don't need
+ * any operations on the root directory. However, we need a non-trivial
+ * d_name - pfm: will go nicely and kill the special-casing in procfs.
+ */
+static struct vfsmount *pfmfs_mnt;
+
+int __init pfm_init_fs(void)
+{
+ int err = register_filesystem(&pfm_fs_type);
+ if (!err) {
+ pfmfs_mnt = kern_mount(&pfm_fs_type);
+ err = PTR_ERR(pfmfs_mnt);
+ if (IS_ERR(pfmfs_mnt))
+ unregister_filesystem(&pfm_fs_type);
+ else
+ err = 0;
+ }
+ return err;
+}
+
+/*
+ * called either on explicit close() or from exit_files().
+ * Only the LAST user of the file gets to this point, i.e., it is
+ * called only ONCE.
+ *
+ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
+ * (fput()),i.e, last task to access the file. Nobody else can access the
+ * file at this point.
+ *
+ * When called from exit_files(), the VMA has been freed because exit_mm()
+ * is executed before exit_files().
+ *
+ * When called from exit_files(), the current task is not yet ZOMBIE but we
+ * flush the PMU state to the context.
+ */
+static int __pfm_close(struct pfm_context *ctx, struct file *filp)
+{
+ unsigned long flags;
+ int state;
+ int can_free = 1, can_unload = 1;
+ int can_release = 0;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ state = ctx->state;
+
+ PFM_DBG("state=%d", state);
+
+ /*
+ * check if unload is needed
+ */
+ if (state == PFM_CTX_UNLOADED)
+ goto doit;
+
+#ifdef CONFIG_SMP
+ if (ctx->task != current) {
+ /*
+ * switch context to zombie state
+ */
+ ctx->state = PFM_CTX_ZOMBIE;
+
+ PFM_DBG("zombie ctx for [%d]", ctx->task->pid);
+ /*
+ * PMU session will be released by monitored task when
+ * it notices ZOMBIE state as part of pfm_unload_context()
+ */
+ can_unload = can_free = 0;
+ }
+#endif
+ if (can_unload)
+ can_release = !__pfm_unload_context(ctx);
+doit:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (can_release)
+ pfm_session_release();
+
+ if (can_free)
+ pfm_free_context(ctx);
+
+ return 0;
+}
+
+/*
+ * called either on explicit close() or from exit_files().
+ * Only the LAST user of the file gets to this point, i.e., it is
+ * called only ONCE.
+ *
+ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
+ * (fput()),i.e, last task to access the file. Nobody else can access the
+ * file at this point.
+ *
+ * When called from exit_files(), the VMA has been freed because exit_mm()
+ * is executed before exit_files().
+ *
+ * When called from exit_files(), the current task is not yet ZOMBIE but we
+ * flush the PMU state to the context.
+ */
+static int pfm_close(struct inode *inode, struct file *filp)
+{
+ struct pfm_context *ctx;
+
+ PFM_DBG("called filp=%p", filp);
+
+ ctx = filp->private_data;
+ if (ctx == NULL) {
+ PFM_ERR("no ctx");
+ return -EBADF;
+ }
+ return __pfm_close(ctx, filp);
+}
+
+static int pfm_no_open(struct inode *irrelevant, struct file *dontcare)
+{
+ PFM_DBG("pfm_file_ops");
+
+ return -ENXIO;
+}
+
+static unsigned int pfm_no_poll(struct file *filp, poll_table *wait)
+{
+ return 0;
+}
+
+static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ PFM_DBG("pfm_read called");
+ return -EINVAL;
+}
+
+static ssize_t pfm_write(struct file *file, const char __user *ubuf,
+ size_t size, loff_t *ppos)
+{
+ PFM_DBG("pfm_write called");
+ return -EINVAL;
+}
+
+static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ PFM_DBG("pfm_ioctl called");
+ return -EINVAL;
+}
+
+const struct file_operations pfm_file_ops = {
+ .llseek = no_llseek,
+ .read = pfm_read,
+ .write = pfm_write,
+ .ioctl = pfm_ioctl,
+ .open = pfm_no_open, /* special open to disallow open via /proc */
+ .release = pfm_close,
+ .poll = pfm_no_poll,
+};
+
+static int pfmfs_delete_dentry(struct dentry *dentry)
+{
+ return 1;
+}
+
+static struct dentry_operations pfmfs_dentry_operations = {
+ .d_delete = pfmfs_delete_dentry,
+};
+
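+/*
+ * pfm_alloc_fd - allocate a file and file descriptor backed by an
+ * anonymous pfmfs inode. The returned descriptor is what user space
+ * uses to designate the perfmon context in the pfm_*() system calls.
+ */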
+int pfm_alloc_fd(struct file **cfile)
+{
+ int fd, ret = 0;
+ struct file *file = NULL;
+ struct inode * inode;
+ char name[32];
+ struct qstr this;
+
+ fd = get_unused_fd();
+ if (fd < 0)
+ return -ENFILE;
+
+ ret = -ENFILE;
+
+ file = get_empty_filp();
+ if (!file)
+ goto out;
+
+ /*
+ * allocate a new inode
+ */
+ inode = new_inode(pfmfs_mnt->mnt_sb);
+ if (!inode)
+ goto out;
+
+ PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode);
+
+ inode->i_sb = pfmfs_mnt->mnt_sb;
+ inode->i_mode = S_IFCHR|S_IRUGO;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+
+ sprintf(name, "[%lu]", inode->i_ino);
+ this.name = name;
+ this.hash = inode->i_ino;
+ this.len = strlen(name);
+
+ ret = -ENOMEM;
+
+ /*
+ * allocate a new dcache entry
+ */
+ file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
+ if (!file->f_dentry)
+ goto out;
+
+ file->f_dentry->d_op = &pfmfs_dentry_operations;
+
+ d_add(file->f_dentry, inode);
+ file->f_vfsmnt = mntget(pfmfs_mnt);
+ file->f_mapping = inode->i_mapping;
+
+ file->f_op = &pfm_file_ops;
+ file->f_mode = FMODE_READ;
+ file->f_flags = O_RDONLY;
+ file->f_pos = 0;
+
+ *cfile = file;
+
+ return fd;
+out:
+ if (file)
+ put_filp(file);
+ put_unused_fd(fd);
+ return ret;
+}
diff --git a/perfmon/perfmon_init.c b/perfmon/perfmon_init.c
new file mode 100644
index 000000000000..a92126d1687c
--- /dev/null
+++ b/perfmon/perfmon_init.c
@@ -0,0 +1,87 @@
+/*
+ * perfmon_init.c: perfmon2 global initialization functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * external variables
+ */
+DEFINE_PER_CPU(struct task_struct *, pmu_owner);
+DEFINE_PER_CPU(struct pfm_context *, pmu_ctx);
+DEFINE_PER_CPU(u64, pmu_activation_number);
+
+int perfmon_disabled; /* >0 if perfmon is disabled */
+
+/*
+ * global initialization routine, executed only once
+ */
+int __init pfm_init(void)
+{
+ PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+ if (pfm_init_ctx())
+ goto error_disable;
+
+ if (pfm_init_fs())
+ goto error_disable;
+
+ if (pfm_init_sysfs())
+ goto error_disable;
+
+ /*
+ * one time, arch-specific global initialization
+ */
+ if (pfm_arch_init())
+ goto error_disable;
+
+ return 0;
+
+error_disable:
+ PFM_ERR("perfmon is disabled due to initialization error");
+ perfmon_disabled = 1;
+ return -1;
+}
+
+/*
+ * must use subsys_initcall() to ensure that the perfmon2 core
+ * is initialized before any PMU description module when they are
+ * compiled in.
+ */
+subsys_initcall(pfm_init);
diff --git a/perfmon/perfmon_intr.c b/perfmon/perfmon_intr.c
new file mode 100644
index 000000000000..d9e87bb11aa2
--- /dev/null
+++ b/perfmon/perfmon_intr.c
@@ -0,0 +1,295 @@
+/*
+ * perfmon_intr.c: perfmon2 interrupt handling
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation
+ * @ctx: context to operate on
+ * @set: set to operate on
+ *
+ * The function returns the number of 64-bit overflows detected.
+ *
+ * 64-bit software pmds are updated for overflowed pmd registers
+ *
+ * In any case, set->npend_ovfls is cleared
+ */
+static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u16 i, num_ovfls, max_pmd, max_intr;
+ u16 num_64b_ovfls;
+ u64 old_val, new_val, ovfl_mask;
+
+ num_64b_ovfls = 0;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ max_pmd = ctx->regs.max_pmd;
+ max_intr = ctx->regs.max_intr_pmd;
+
+ num_ovfls = set->npend_ovfls;
+
+ for (i = 0; num_ovfls; i++) {
+ /*
+ * skip pmd which did not overflow
+ */
+ if (!pfm_arch_bv_test_bit(i, set->povfl_pmds))
+ continue;
+
+ num_ovfls--;
+
+ /*
+ * Update software value for counters ONLY
+ *
+ * Note that the pmd is not necessarily 0 at this point as
+ * qualified events may have happened before the PMU was
+ * frozen. The residual count is not taken into consideration
+ * here but will be with any read of the pmd
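+		 *
+		 * (the increment of 1 + ovfl_mask below is one full period
+		 *  of the hardware counter, e.g. 2^48 for a 48-bit counter,
+		 *  credited to the 64-bit software value on each overflow)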
+ */
+ if (likely(pfm_arch_bv_test_bit(i, ctx->regs.cnt_pmds))) {
+ old_val = new_val = set->pmds[i];
+ new_val += 1 + ovfl_mask;
+ set->pmds[i] = new_val;
+ } else {
+ /*
+ * for non counters which interrupt, e.g., AMD IBS,
+ * we consider this equivalent to a 64-bit counter
+ * overflow.
+ */
+ old_val = 1; new_val = 0;
+ }
+
+ /*
+ * check for 64-bit overflow condition
+ */
+ if (likely(old_val > new_val)) {
+ num_64b_ovfls++;
+ } else {
+ /*
+ * on some PMU, it may be necessary to re-arm the PMD
+ */
+ pfm_arch_ovfl_reset_pmd(ctx, i);
+ }
+
+ PFM_DBG_ovfl("pmd%u ovfl=%s new=0x%llx old=0x%llx "
+ "hw_pmd=0x%llx",
+ i,
+ old_val > new_val ? "64-bit" : "HW",
+ (unsigned long long)new_val,
+ (unsigned long long)old_val,
+ (unsigned long long)pfm_read_pmd(ctx, i));
+ }
+ /*
+ * mark the overflows as consumed
+ */
+ set->npend_ovfls = 0;
+ pfm_arch_bv_zero(set->povfl_pmds, max_intr);
+
+ return num_64b_ovfls;
+}
+
+/**
+ * pfm_overflow_handler - main overflow processing routine.
+ * @ctx: context to work on (always current context)
+ * @set: current event set
+ * @ip: interrupt instruction pointer
+ * @regs: machine state
+ */
+static void pfm_overflow_handler(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ unsigned long ip,
+ struct pt_regs *regs)
+{
+ /*
+ * skip ZOMBIE case
+ */
+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
+ goto stop_monitoring;
+
+ PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p u_pmds=0x%llx",
+ (unsigned long long)set->povfl_pmds[0],
+ set->npend_ovfls,
+ (void *)ip,
+ (unsigned long long)set->used_pmds[0]);
+
+ /*
+	 * process overflowed PMDs (the 64-bit overflow count is not used here)
+ */
+ pfm_intr_process_64bit_ovfls(ctx, set);
+
+ return;
+
+stop_monitoring:
+ /*
+ * Does not happen for a self-monitored context.
+	 * We cannot attach to a kernel-only thread, thus it is safe to
+	 * set TIF bits, i.e., the thread will eventually leave the kernel
+	 * or die and either way we will catch the context and clean it up in
+	 * pfm_handle_work() or pfm_exit_thread().
+ *
+ * Mask until we get to pfm_handle_work()
+ * pfm_mask_monitoring(ctx, set);
+ */
+ PFM_DBG_ovfl("ctx is zombie, converted to spurious");
+ pfm_post_work(current, ctx, PFM_WORK_ZOMBIE);
+}
+
+/**
+ * __pfm_interrupt_handler - 1st level interrupt handler
+ * @ip: interrupted instruction pointer
+ * @regs: machine state
+ *
+ * Function is static because we use a wrapper to easily capture timing info.
+ *
+ * Context locking necessary to avoid concurrent accesses from other CPUs
+ */
+static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
+{
+ struct task_struct *task;
+ struct pfm_context *ctx;
+ struct pfm_event_set *set;
+
+ task = __get_cpu_var(pmu_owner);
+ ctx = __get_cpu_var(pmu_ctx);
+
+ /*
+ * verify if there is a context on this CPU
+ */
+ if (unlikely(ctx == NULL)) {
+ PFM_DBG_ovfl("no ctx");
+ goto spurious;
+ }
+
+ /*
+ * we need to lock context because it could be accessed
+ * from another CPU. Depending on the priority level of
+ * the PMU interrupt or the arch, it may be necessary to
+	 * mask interrupts altogether to avoid a race condition with
+ * the timer interrupt in case of time-based set switching,
+ * for instance.
+ */
+ spin_lock(&ctx->lock);
+
+ set = ctx->active_set;
+
+ /*
+ * For SMP per-thread, it is not possible to have
+ * owner != NULL && task != current.
+ *
+ * For UP per-thread, because of lazy save, it
+ * is possible to receive an interrupt in another task
+ * which is not using the PMU. This means
+ * that the interrupt was in-flight at the
+ * time of pfm_ctxswout_thread(). In that
+ * case, it will be replayed when the task
+ * is scheduled again. Hence we convert to spurious.
+ *
+ * The basic rule is that an overflow is always
+ * processed in the context of the task that
+ * generated it for all per-thread contexts.
+ */
+#ifndef CONFIG_SMP
+ if (unlikely((task && current->pfm_context != ctx))) {
+ PFM_DBG_ovfl("spurious: not owned by current task");
+ goto spurious;
+ }
+#endif
+ /*
+ * check that monitoring is active, otherwise convert
+ * to spurious
+ */
+ if (unlikely(!pfm_arch_is_active(ctx))) {
+ PFM_DBG_ovfl("spurious: monitoring non active");
+ goto spurious;
+ }
+
+ /*
+ * freeze PMU and collect overflowed PMD registers
+ * into set->povfl_pmds. Number of overflowed PMDs
+ * reported in set->npend_ovfls
+ */
+ pfm_arch_intr_freeze_pmu(ctx, set);
+
+ /*
+ * no overflow detected, interrupt may have come
+ * from the previous thread running on this CPU
+ */
+ if (unlikely(!set->npend_ovfls)) {
+ PFM_DBG_ovfl("no npend_ovfls");
+ goto spurious;
+ }
+
+ /*
+ * invoke actual handler
+ */
+ pfm_overflow_handler(ctx, set, ip, regs);
+
+ /*
+ * unfreeze PMU
+ */
+ pfm_arch_intr_unfreeze_pmu(ctx);
+
+ spin_unlock(&ctx->lock);
+
+ return;
+
+spurious:
+ /* ctx may be NULL */
+ pfm_arch_intr_unfreeze_pmu(ctx);
+ if (ctx)
+ spin_unlock(&ctx->lock);
+}
+
+/**
+ * pfm_interrupt_handler - 1st level interrupt handler
+ * @ip: interrupted instruction pointer
+ * @regs: machine state
+ *
+ * Function called from the low-level assembly code or arch-specific perfmon
+ * code. Simple wrapper used for timing purposes. Actual work done in
+ * __pfm_interrupt_handler()
+ */
+void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
+{
+ BUG_ON(!irqs_disabled());
+ __pfm_interrupt_handler(ip, regs);
+}
diff --git a/perfmon/perfmon_pmu.c b/perfmon/perfmon_pmu.c
new file mode 100644
index 000000000000..0e44ee8530a6
--- /dev/null
+++ b/perfmon/perfmon_pmu.c
@@ -0,0 +1,269 @@
+/*
+ * perfmon_pmu.c: perfmon2 PMU configuration management
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+#ifndef CONFIG_MODULE_UNLOAD
+#define module_refcount(n) 1
+#endif
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock);
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock);
+static u32 pfm_pmu_acquired;
+
+/*
+ * perfmon core must access PMU information ONLY through pfm_pmu_conf
+ * if pfm_pmu_conf is NULL, then no description is registered
+ */
+struct pfm_pmu_config *pfm_pmu_conf;
+EXPORT_SYMBOL(pfm_pmu_conf);
+
+/**
+ * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table
+ * @regs: the regdesc structure to initialize
+ * @excl_type: the register type(s) to exclude from this regdesc
+ * @unavail_pmcs: unavailable PMC registers
+ * @unavail_pmds: unavailable PMD registers
+ */
+static void pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type,
+ u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_regmap_desc *d;
+ u16 n, n2, n_counters, i;
+ int max1, max2, max3;
+
+ /*
+ * compute the number of implemented PMC from the
+ * description table
+ */
+ n = 0;
+ max1 = max2 = -1;
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (pfm_arch_bv_test_bit(i, unavail_pmcs))
+ continue;
+
+ if (d->type & excl_type)
+ continue;
+
+ pfm_arch_bv_set_bit(i, regs->pmcs);
+
+ max1 = i;
+ n++;
+ }
+
+ regs->max_pmc = max1 + 1;
+ regs->num_pmcs = n;
+
+ n = n_counters = n2 = 0;
+ max1 = max2 = max3 = -1;
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (pfm_arch_bv_test_bit(i, unavail_pmds))
+ continue;
+
+ if (d->type & excl_type)
+ continue;
+
+ pfm_arch_bv_set_bit(i, regs->pmds);
+ max1 = i;
+ n++;
+
+ /*
+ * read-write registers
+ */
+ if (!(d->type & PFM_REG_RO)) {
+ pfm_arch_bv_set_bit(i, regs->rw_pmds);
+ max3 = i;
+ n2++;
+ }
+
+ /*
+ * counter registers
+ */
+ if (d->type & PFM_REG_C64) {
+ pfm_arch_bv_set_bit(i, regs->cnt_pmds);
+ n_counters++;
+ }
+
+ /*
+ * PMD with intr capabilities
+ */
+ if (d->type & PFM_REG_INTR) {
+ pfm_arch_bv_set_bit(i, regs->intr_pmds);
+ max2 = i;
+ }
+ }
+
+ regs->max_pmd = max1 + 1;
+ regs->max_intr_pmd = max2 + 1;
+
+ regs->num_counters = n_counters;
+ regs->num_pmds = n;
+ regs->max_rw_pmd = max3 + 1;
+ regs->num_rw_pmd = n2;
+}
+
+int pfm_pmu_register(struct pfm_pmu_config *cfg)
+{
+ int ret = -EBUSY;
+
+ if (perfmon_disabled) {
+ PFM_INFO("perfmon disabled, cannot add PMU description");
+ return -ENOSYS;
+ }
+
+ spin_lock(&pfm_pmu_conf_lock);
+
+ if (pfm_pmu_conf)
+ goto unlock;
+
+ pfm_pmu_conf = cfg;
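+	/* e.g., counter_width = 48 gives ovfl_mask = 0x0000ffffffffffff */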
+ pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1;
+
+ ret = pfm_sysfs_add_pmu(pfm_pmu_conf);
+ if (ret)
+ pfm_pmu_conf = NULL;
+
+unlock:
+ spin_unlock(&pfm_pmu_conf_lock);
+
+ if (ret)
+ PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret);
+ else
+ PFM_INFO("%s PMU installed", cfg->pmu_name);
+ return ret;
+}
+
+/*
+ * acquire PMU resource from lower-level PMU register allocator
+ * (currently perfctr-watchdog.c)
+ *
+ * acquisition is done when the first context is created (and not
+ * when it is loaded). We grab all that is defined in the description
+ * module and then we make adjustments at the arch-specific level.
+ *
+ * The PMU resource is released when the last perfmon context is
+ * destroyed.
+ *
+ * interrupts are not masked
+ */
+int pfm_pmu_acquire(struct pfm_context *ctx)
+{
+ u64 unavail_pmcs[PFM_PMC_BV];
+ u64 unavail_pmds[PFM_PMD_BV];
+ int ret = 0;
+
+ spin_lock(&pfm_pmu_acq_lock);
+
+ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+ pfm_pmu_acquired++;
+
+ if (pfm_pmu_acquired == 1) {
+
+ memset(unavail_pmcs, 0, sizeof(unavail_pmcs));
+ memset(unavail_pmds, 0, sizeof(unavail_pmds));
+
+ ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds);
+ if (ret) {
+ pfm_pmu_acquired--;
+ } else {
+ memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc));
+
+ pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all, 0,
+ unavail_pmcs,
+ unavail_pmds);
+
+ PFM_DBG("regs_all.pmcs=0x%llx",
+ (unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]);
+
+			/* available PMU resources */
+ PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters",
+ pfm_pmu_conf->regs_all.num_pmcs,
+ pfm_pmu_conf->regs_all.num_pmds,
+ pfm_pmu_conf->regs_all.num_counters);
+ }
+ }
+ spin_unlock(&pfm_pmu_acq_lock);
+ /*
+ * copy global regdesc to context (for future extensions)
+ */
+ ctx->regs = pfm_pmu_conf->regs_all;
+
+ return ret;
+}
+
+/*
+ * release the PMU resource
+ *
+ * actual release happens when last context is destroyed
+ *
+ * interrupts are not masked
+ */
+void pfm_pmu_release(void)
+{
+ BUG_ON(irqs_disabled());
+
+ /*
+ * we need to use a spinlock because release takes some time
+ * and we may have a race with pfm_pmu_acquire()
+ */
+ spin_lock(&pfm_pmu_acq_lock);
+
+ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+ /*
+	 * we decouple the test and the decrement because if pfm_pmu_acquire()
+	 * failed, we still come here from pfm_context_free(), but with
+	 * pfm_pmu_acquired == 0
+ */
+ if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) {
+ pfm_arch_pmu_release();
+ PFM_DBG("PMU released");
+ }
+ spin_unlock(&pfm_pmu_acq_lock);
+}
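+
+/*
+ * Pairing sketch (illustrative): pfm_pmu_acquire()/pfm_pmu_release() are
+ * reference counted, so callers on the context create/destroy paths are
+ * expected to balance them, roughly:
+ *
+ *	create:  if (pfm_pmu_acquire(ctx)) -> fail context creation
+ *	destroy: pfm_pmu_release();
+ *
+ * The underlying pfm_arch_pmu_acquire()/pfm_arch_pmu_release() calls only
+ * happen on the 0 -> 1 and 1 -> 0 transitions of pfm_pmu_acquired.
+ */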
diff --git a/perfmon/perfmon_priv.h b/perfmon/perfmon_priv.h
new file mode 100644
index 000000000000..f1068e5ff308
--- /dev/null
+++ b/perfmon/perfmon_priv.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __PERFMON_PRIV_H__
+#define __PERFMON_PRIV_H__
+/*
+ * This file contains all the definitions of data structures, variables, macros
+ * that are private to the generic code, i.e., not shared with any code that
+ * lives under arch/ or include/asm-XX
+ *
+ * For shared definitions, use include/linux/perfmon_kern.h
+ */
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * context lazy save/restore activation count
+ */
+#define PFM_INVALID_ACTIVATION ((u64)~0)
+
+DECLARE_PER_CPU(u64, pmu_activation_number);
+
+static inline void pfm_set_pmu_owner(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ __get_cpu_var(pmu_owner) = task;
+ __get_cpu_var(pmu_ctx) = ctx;
+}
+
+int pfm_init_ctx(void);
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req,
+ int count);
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req,
+ int count);
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count);
+
+int pfm_session_acquire(void);
+void pfm_session_release(void);
+
+int pfm_init_sysfs(void);
+
+int __pfm_create_context(__u32 ctx_flags, struct pfarg_sinfo *sif,
+ struct pfm_context **new_ctx);
+void pfm_free_context(struct pfm_context *ctx);
+void pfm_undo_create(int fd, struct pfm_context *ctx);
+
+int __pfm_stop(struct pfm_context *ctx);
+int __pfm_start(struct pfm_context *ctx);
+
+int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task);
+int __pfm_unload_context(struct pfm_context *ctx);
+
+int pfm_alloc_fd(struct file **cfile);
+
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what);
+
+int pfm_pmu_acquire(struct pfm_context *ctx);
+void pfm_pmu_release(void);
+
+void pfm_save_pmds(struct pfm_context *ctx);
+
+/*
+ * check_mask bitmask values for pfm_check_task_state()
+ */
+#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */
+#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */
+#define PFM_CMD_UNLOAD 0x04 /* command is unload */
+
+/**
+ * pfm_check_save_prev_ctx - check if a previous context exists and save its state
+ *
+ * called from pfm_load_ctx_thread() and __pfm_ctxsin_thread() to
+ * check if a previous context exists. If so, save its PMU state. This is
+ * used only for UP kernels.
+ *
+ * PMU ownership is not cleared because the function is always called while
+ * trying to install a new owner.
+ */
+static inline void pfm_check_save_prev_ctx(void)
+{
+#ifdef CONFIG_SMP
+ struct pfm_context *ctxp;
+
+ ctxp = __get_cpu_var(pmu_ctx);
+ if (!ctxp)
+ return;
+ /*
+ * in UP per-thread, due to lazy save
+ * there could be a context from another
+ * task. We need to push it first before
+ * installing our new state
+ */
+ pfm_save_pmds(ctxp);
+ /*
+ * do not clear ownership because we rewrite
+ * right away
+ */
+#endif
+}
+
+int pfm_init_fs(void);
+
+static inline void pfm_post_work(struct task_struct *task,
+ struct pfm_context *ctx, int type)
+{
+ ctx->flags.work_type = type;
+ set_tsk_thread_flag(task, TIF_PERFMON_WORK);
+}
+
+#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG
+#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG
+
+#endif /* CONFIG_PERFMON */
+
+#endif /* __PERFMON_PRIV_H__ */
diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c
new file mode 100644
index 000000000000..0af9dfa98b22
--- /dev/null
+++ b/perfmon/perfmon_res.c
@@ -0,0 +1,223 @@
+/*
+ * perfmon_res.c: perfmon2 resource allocations
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * global information about all sessions
+ */
+struct pfm_resources {
+ cpumask_t sys_cpumask; /* bitmask of used cpus */
+ u32 thread_sessions; /* #num loaded per-thread sessions */
+};
+
+static struct pfm_resources pfm_res;
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock);
+
+/**
+ * pfm_session_acquire - reserve a per-thread session
+ *
+ * return:
+ * 0 : success
+ * -EBUSY: if a conflicting session exists
+ */
+int pfm_session_acquire(void)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+	 * validity checks on cpu_mask have been done upstream
+ */
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ PFM_DBG("in thread=%u",
+ pfm_res.thread_sessions);
+
+ pfm_res.thread_sessions++;
+
+ PFM_DBG("out thread=%u ret=%d",
+ pfm_res.thread_sessions,
+ ret);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+ return ret;
+}
+
+/**
+ * pfm_session_release - release a per-thread session
+ *
+ * called from __pfm_unload_context()
+ */
+void pfm_session_release(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ PFM_DBG("in thread=%u",
+ pfm_res.thread_sessions);
+
+ pfm_res.thread_sessions--;
+
+ PFM_DBG("out thread=%u",
+ pfm_res.thread_sessions);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+
+/**
+ * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus
+ *
+ * currently used by Oprofile on X86
+ */
+int pfm_session_allcpus_acquire(void)
+{
+ unsigned long flags;
+ u32 nsys_cpus, cpu;
+ int ret = -EBUSY;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+ PFM_DBG("in sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ if (nsys_cpus) {
+ PFM_DBG("already some system-wide sessions");
+ goto abort;
+ }
+
+ /*
+ * cannot mix system wide and per-task sessions
+ */
+ if (pfm_res.thread_sessions) {
+ PFM_DBG("%u conflicting thread_sessions",
+ pfm_res.thread_sessions);
+ goto abort;
+ }
+
+ for_each_online_cpu(cpu) {
+ cpu_set(cpu, pfm_res.sys_cpumask);
+ nsys_cpus++;
+ }
+
+ PFM_DBG("out sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ ret = 0;
+abort:
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(pfm_session_allcpus_acquire);
+
+/**
+ * pfm_session_allcpus_release - release per-cpu sessions on all cpus
+ *
+ * currently used by Oprofile code
+ */
+void pfm_session_allcpus_release(void)
+{
+ unsigned long flags;
+ u32 nsys_cpus, cpu;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+ PFM_DBG("in sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ /*
+ * XXX: could use __cpus_clear() with nbits
+ */
+ for_each_online_cpu(cpu) {
+ cpu_clear(cpu, pfm_res.sys_cpumask);
+ nsys_cpus--;
+ }
+
+ PFM_DBG("out sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+EXPORT_SYMBOL(pfm_session_allcpus_release);
+
+/**
+ * pfm_sysfs_res_show - return current resource usage for sysfs
+ * @buf: buffer to hold string in return
+ * @sz: size of buf
+ * @what: what to produce
+ * what=0 : thread_sessions
+ * what=1 : cpus_weight(sys_cpumask)
+ * what=2 : smpl_buf_mem_cur
+ * what=3 : pmu model name
+ *
+ * called from perfmon_sysfs.c
+ * return number of bytes written into buf (up to sz)
+ */
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ switch (what) {
+ case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions);
+ break;
+ case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask));
+ break;
+ case 3:
+ snprintf(buf, sz, "%s\n",
+ pfm_pmu_conf ? pfm_pmu_conf->pmu_name
+				      : "unknown");
+ }
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+ return strlen(buf);
+}
diff --git a/perfmon/perfmon_rw.c b/perfmon/perfmon_rw.c
new file mode 100644
index 000000000000..bea77d455794
--- /dev/null
+++ b/perfmon/perfmon_rw.c
@@ -0,0 +1,449 @@
+/*
+ * perfmon.c: perfmon2 PMC/PMD read/write system calls
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * is_invalid -- check if register index is within limits
+ * @cnum: register index
+ * @impl: bitmask of implemented registers
+ * @max: highest implemented register index + 1
+ *
+ * return:
+ * 0 if the register index is valid
+ * 1 if invalid
+ */
+static inline int is_invalid(u16 cnum, u64 *impl, u16 max)
+{
+ return cnum >= max || !pfm_arch_bv_test_bit(cnum, impl);
+}
+
+/**
+ * update_used_reg -- update used_pmcs for a single PMD
+ * @ctx: context to use
+ * @set: set to update
+ * @cnum: new PMD to add
+ *
+ * This function adds to used_pmcs the PMCs that PMD cnum depends on
+ */
+static inline void update_used_reg(struct pfm_context *ctx,
+ struct pfm_event_set *set, u16 cnum)
+{
+ pfm_arch_bv_or(set->used_pmcs,
+ set->used_pmcs,
+ pfm_pmu_conf->pmd_desc[cnum].dep_pmcs,
+ ctx->regs.max_pmc);
+}
+
+/**
+ * update_changes -- update nused_pmcs, nused_pmds, write newly touched pmcs
+ * @ctx: context to use
+ * @set: event set to use
+ * @old_used_pmcs: former used_pmc bitmask
+ *
+ * This function updates nused_pmcs and nused_pmds after the last modification
+ * to an event set. When new pmcs are used, then they must be initialized such
+ * that we do not pick up stale values from another session.
+ */
+static inline int update_changes(struct pfm_context *ctx, struct pfm_event_set *set,
+ u64 *old_used_pmcs)
+{
+ struct pfarg_pmr req;
+ u16 max_pmc, max_pmd;
+ int n, p, q, ret = 0;
+
+ max_pmd = ctx->regs.max_pmd;
+ max_pmc = ctx->regs.max_pmc;
+
+ /*
+ * update used counts
+ */
+ set->nused_pmds = pfm_arch_bv_weight(set->used_pmds, max_pmd);
+ set->nused_pmcs = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+
+ PFM_DBG("u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u",
+ (unsigned long long)set->used_pmds[0],
+ set->nused_pmds,
+ (unsigned long long)set->used_pmcs[0],
+ set->nused_pmcs);
+
+ memset(&req, 0, sizeof(req));
+
+ n = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+	for (p = 0; n; n--, p = q + 1) {
+ q = pfm_arch_bv_find_next_bit(set->used_pmcs, max_pmc, p);
+
+ if (pfm_arch_bv_test_bit(q, old_used_pmcs))
+ continue;
+
+ req.reg_num = q;
+ req.reg_value = set->pmcs[q];
+
+ ret = __pfm_write_pmcs(ctx, &req, 1);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+/**
+ * __pfm_write_pmds - modify data registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ struct pfm_event_set *set;
+ u64 old_used_pmcs[PFM_PMC_BV];
+ u64 value, ovfl_mask;
+ u64 *impl_pmds;
+ u16 cnum, pmd_type, max_pmd;
+ int i, can_access_pmu;
+ int ret;
+ pfm_pmd_check_t wr_func;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ max_pmd = ctx->regs.max_pmd;
+ impl_pmds = ctx->regs.pmds;
+ wr_func = pfm_pmu_conf->pmd_write_check;
+
+ can_access_pmu = 0;
+
+ /*
+ * we cannot access the actual PMD registers when monitoring is masked
+ */
+ if (unlikely(ctx->state == PFM_CTX_LOADED))
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ pfm_arch_bv_copy(old_used_pmcs, set->used_pmcs,
+ ctx->regs.max_pmc);
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+
+ /*
+		 * cannot write to a non-existing register,
+		 * writes to read-only registers are ignored
+ */
+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+ PFM_DBG("pmd%u is not available", cnum);
+ goto error;
+ }
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+ /*
+ * execute write checker, if any
+ */
+ if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) {
+ ret = (*wr_func)(ctx, set, req);
+ if (ret)
+ goto error;
+
+ }
+
+ value = req->reg_value;
+
+ /*
+		 * we reprogram the PMD, hence we clear any pending
+		 * overflow. This does affect overflow switching on restart,
+		 * but the new value has already been established here
+ */
+ if (pfm_arch_bv_test_bit(cnum, set->povfl_pmds)) {
+ set->npend_ovfls--;
+ pfm_arch_bv_clear_bit(cnum, set->povfl_pmds);
+ }
+
+ /*
+ * update value
+ */
+ set->pmds[cnum] = value;
+
+ pfm_arch_bv_set_bit(cnum, set->used_pmds);
+ update_used_reg(ctx, set, cnum);
+
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS;
+ if (can_access_pmu)
+ pfm_write_pmd(ctx, cnum, value);
+
+ /*
+ * update number of used PMD registers
+ */
+ set->nused_pmds = pfm_arch_bv_weight(set->used_pmds,
+ max_pmd);
+
+ PFM_DBG("pmd%u=0x%llx a_pmu=%d "
+ "ctx_pmd=0x%llx "
+ " u_pmds=0x%llx nu_pmds=%u ",
+ cnum,
+ (unsigned long long)value,
+ can_access_pmu,
+ (unsigned long long)set->pmds[cnum],
+ (unsigned long long)set->used_pmds[0],
+ set->nused_pmds);
+ }
+ ret = 0;
+error:
+ update_changes(ctx, set, old_used_pmcs);
+ /*
+ * make changes visible
+ */
+ if (can_access_pmu)
+ pfm_arch_serialize();
+
+ return ret;
+}
+
+/**
+ * __pfm_write_pmcs - modify config registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ struct pfm_event_set *set;
+ u64 value, dfl_val, rsvd_msk;
+ u64 *impl_pmcs;
+ int i, can_access_pmu;
+ int ret;
+ u16 cnum, pmc_type, max_pmc;
+ pfm_pmc_check_t wr_func;
+
+ wr_func = pfm_pmu_conf->pmc_write_check;
+ max_pmc = ctx->regs.max_pmc;
+ impl_pmcs = ctx->regs.pmcs;
+
+ can_access_pmu = 0;
+
+ /*
+ * we cannot access the actual PMC registers when monitoring is masked
+ */
+ if (unlikely(ctx->state == PFM_CTX_LOADED))
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+ value = req->reg_value;
+
+ /*
+ * no access to unavailable PMC register
+ */
+ if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) {
+ PFM_DBG("pmc%u is not available", cnum);
+ goto error;
+ }
+
+ pmc_type = pfm_pmu_conf->pmc_desc[cnum].type;
+ dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
+ rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk;
+
+ /*
+ * set reserved bits to default values
+ * (reserved bits must be 1 in rsvd_msk)
+ */
+ value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk);
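+
+		/*
+		 * Worked example (illustrative): with rsvd_msk = 0xf0,
+		 * dfl_val = 0x30 and a user value of 0xff, the result is
+		 * (0xff & ~0xf0) | (0x30 & 0xf0) = 0x0f | 0x30 = 0x3f,
+		 * i.e., user bits are kept only outside the reserved field,
+		 * which is forced to its default value.
+		 */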
+
+ /*
+ * execute write checker, if any
+ */
+ if (likely(wr_func && (pmc_type & PFM_REG_WC))) {
+ req->reg_value = value;
+ ret = (*wr_func)(ctx, set, req);
+ if (ret)
+ goto error;
+ value = req->reg_value;
+ }
+
+ /*
+ * Now we commit the changes
+ */
+
+ /*
+ * mark PMC register as used
+ * We do not track associated PMC register based on
+ * the fact that they will likely need to be written
+ * in order to become useful at which point the statement
+	 * in order to become useful, at which point the statement
+ *
+ * The used_pmcs bitmask is only useful on architectures where
+ * the PMC needs to be modified for particular bits, especially
+ * on overflow or to stop/start.
+ */
+ if (!pfm_arch_bv_test_bit(cnum, set->used_pmcs)) {
+ pfm_arch_bv_set_bit(cnum, set->used_pmcs);
+ set->nused_pmcs++;
+ }
+
+ set->pmcs[cnum] = value;
+
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+ if (can_access_pmu)
+ pfm_arch_write_pmc(ctx, cnum, value);
+
+ PFM_DBG("pmc%u=0x%llx a_pmu=%d "
+ "u_pmcs=0x%llx nu_pmcs=%u",
+ cnum,
+ (unsigned long long)value,
+ can_access_pmu,
+ (unsigned long long)set->used_pmcs[0],
+ set->nused_pmcs);
+ }
+ ret = 0;
+error:
+ /*
+ * make sure the changes are visible
+ */
+ if (can_access_pmu)
+ pfm_arch_serialize();
+
+ return ret;
+}
+
+/**
+ * __pfm_read_pmds - read data registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ u64 val = 0, ovfl_mask, hw_val;
+ u64 *impl_pmds;
+ struct pfm_event_set *set;
+ int i, ret, can_access_pmu = 0;
+ u16 cnum, pmd_type, max_pmd;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ impl_pmds = ctx->regs.pmds;
+ max_pmd = ctx->regs.max_pmd;
+
+ if (likely(ctx->state == PFM_CTX_LOADED)) {
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+ if (can_access_pmu)
+ pfm_arch_serialize();
+ }
+
+ /*
+ * on both UP and SMP, we can only read the PMD from the hardware
+ * register when the task is the owner of the local PMU.
+ */
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+
+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+			PFM_DBG("pmd%u is not implemented or inaccessible", cnum);
+ goto error;
+ }
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+ /*
+ * it is not possible to read a PMD which was not requested:
+ * - explicitly written via pfm_write_pmds()
+ * - provided as a reg_smpl_pmds[] to another PMD during
+ * pfm_write_pmds()
+ *
+ * This is motivated by security and for optimization purposes:
+ * - on context switch restore, we can restore only what
+ * we use (except when regs directly readable at user
+ * level, e.g., IA-64 self-monitoring, I386 RDPMC).
+ * - do not need to maintain PMC -> PMD dependencies
+ */
+ if (unlikely(!pfm_arch_bv_test_bit(cnum, set->used_pmds))) {
+			PFM_DBG("pmd%u cannot be read because it is not used", cnum);
+ goto error;
+ }
+
+ val = set->pmds[cnum];
+
+ /*
+ * If the task is not the current one, then we check if the
+ * PMU state is still in the local live register due to lazy
+ * ctxsw. If true, then we read directly from the registers.
+ */
+ if (can_access_pmu) {
+ hw_val = pfm_read_pmd(ctx, cnum);
+ if (pmd_type & PFM_REG_C64)
+ val = (val & ~ovfl_mask)
+ | (hw_val & ovfl_mask);
+ else
+ val = hw_val;
+ }
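+
+		/*
+		 * Illustrative example: with counter_width = 32,
+		 * ovfl_mask = 0xffffffff. If the software copy holds
+		 * 0x300000000 (upper bits accumulated from past overflows)
+		 * and the hardware reads 0x1234, the merged 64-bit value
+		 * is 0x300000000 | 0x1234 = 0x300001234.
+		 */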
+
+ PFM_DBG("pmd%u=0x%llx ",
+ cnum,
+ (unsigned long long)val);
+
+ req->reg_value = val;
+ }
+ ret = 0;
+error:
+ return ret;
+}
diff --git a/perfmon/perfmon_syscalls.c b/perfmon/perfmon_syscalls.c
new file mode 100644
index 000000000000..5c900bb05ad9
--- /dev/null
+++ b/perfmon/perfmon_syscalls.c
@@ -0,0 +1,741 @@
+/*
+ * perfmon_syscalls.c: perfmon2 system call interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/ptrace.h>
+#include <linux/perfmon_kern.h>
+#include <linux/uaccess.h>
+#include "perfmon_priv.h"
+
+/*
+ * Context locking rules:
+ * ---------------------
+ * - any thread with access to the file descriptor of a context can
+ * potentially issue perfmon calls
+ *
+ * - calls must be serialized to guarantee correctness
+ *
+ * - as soon as a context is attached to a thread or CPU, it may be
+ * actively monitoring. On some architectures, such as IA-64, this
+ * is true even though the pfm_start() call has not been made. This
+ * comes from the fact that on some architectures, it is possible to
+ * start/stop monitoring from userland.
+ *
+ * - If monitoring is active, then there can be PMU interrupts. Because
+ * context accesses must be serialized, the perfmon system calls
+ * must mask interrupts as soon as the context is attached.
+ *
+ * - perfmon system calls that operate with the context unloaded cannot
+ * assume it is actually unloaded when they are called. They first need
+ * to check and for that they need interrupts masked. Then, if the
+ * context is actually unloaded, they can unmask interrupts.
+ *
+ * - interrupt masking holds true for other internal perfmon functions as
+ *   well, except for the PMU interrupt handler, because those interrupts
+ *   cannot be nested.
+ *
+ * - we mask ALL interrupts instead of just the PMU interrupt because we
+ * also need to protect against timer interrupts which could trigger
+ * a set switch.
+ */
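+
+/*
+ * Canonical call pattern (sketch, for illustration): every syscall in this
+ * file follows roughly the same sequence, which is what the rules above
+ * describe:
+ *
+ *	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ *	spin_lock_irqsave(&ctx->lock, flags);
+ *	ret = pfm_check_task_state(ctx, check_mask, &flags);
+ *	if (!ret)
+ *		ret = __pfm_operation(ctx, ...);
+ *	spin_unlock_irqrestore(&ctx->lock, flags);
+ *	pfm_release_ctx_from_fd(&cookie);
+ *
+ * __pfm_operation() stands for any of the __pfm_* helpers; it is not a
+ * real function.
+ */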
+
+struct pfm_syscall_cookie {
+ struct file *filp;
+ int fput_needed;
+};
+
+/*
+ * cannot attach if :
+ * - kernel task
+ * - task not owned by caller (checked by ptrace_may_access())
+ * - task is dead or zombie
+ * - cannot use blocking notification when self-monitoring
+ */
+static int pfm_task_incompatible(struct pfm_context *ctx,
+ struct task_struct *task)
+{
+ /*
+ * cannot attach to a kernel thread
+ */
+ if (!task->mm) {
+ PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
+ return -EPERM;
+ }
+
+ /*
+ * cannot attach to a zombie task
+ */
+ if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) {
+ PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * pfm_get_task -- check permission and acquire task to monitor
+ * @ctx: perfmon context
+ * @pid: identification of the task to check
+ * @task: upon return, a pointer to the task to monitor
+ *
+ * This function is used in per-thread mode only AND when not
+ * self-monitoring. It finds the task to monitor and checks
+ * that the caller has permissions to attach. It also checks
+ * that the task is stopped via ptrace so that we can safely
+ * modify its state.
+ *
+ * the task refcount is incremented on success.
+ */
+static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
+ struct task_struct **task)
+{
+ struct task_struct *p;
+ int ret = 0, ret1 = 0;
+
+ /*
+ * When attaching to another thread we must ensure
+ * that the thread is actually stopped. Just like with
+ * perfmon system calls, we enforce that the thread
+ * be ptraced and STOPPED by using ptrace_check_attach().
+ *
+ * As a consequence, only the ptracing parent can actually
+ * attach a context to a thread. Obviously, this constraint
+ * does not exist for self-monitoring threads.
+ *
+ * We use ptrace_may_access() to check for permission.
+ */
+ read_lock(&tasklist_lock);
+
+ p = find_task_by_vpid(pid);
+ if (p)
+ get_task_struct(p);
+
+ read_unlock(&tasklist_lock);
+
+ if (!p) {
+ PFM_DBG("task not found %d", pid);
+ return -ESRCH;
+ }
+
+ ret = -EPERM;
+
+ /*
+ * returns 0 if cannot attach
+ */
+ ret1 = ptrace_may_access(p, PTRACE_MODE_ATTACH);
+ if (ret1)
+ ret = ptrace_check_attach(p, 0);
+
+ PFM_DBG("may_attach=%d check_attach=%d", ret1, ret);
+
+ if (ret || !ret1)
+ goto error;
+
+ ret = pfm_task_incompatible(ctx, p);
+ if (ret)
+ goto error;
+
+ *task = p;
+
+ return 0;
+error:
+ if (!(ret1 || ret))
+ ret = -EPERM;
+
+ put_task_struct(p);
+
+ return ret;
+}
+
+/*
+ * context must be locked when calling this function
+ */
+int __pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+ unsigned long *flags)
+{
+ struct task_struct *task;
+ unsigned long local_flags, new_flags;
+ int state, ret;
+
+recheck:
+ /*
+ * task is NULL for system-wide context
+ */
+ task = ctx->task;
+ state = ctx->state;
+ local_flags = *flags;
+
+ PFM_DBG("state=%d check_mask=0x%x task=[%d]",
+ state, check_mask, task ? task->pid:-1);
+ /*
+ * if the context is detached, then we do not touch
+	 * hardware, therefore there is no restriction on when we can
+ * access it.
+ */
+ if (state == PFM_CTX_UNLOADED)
+ return 0;
+ /*
+ * no command can operate on a zombie context.
+ * A context becomes zombie when the file that identifies
+ * it is closed while the context is still attached to the
+ * thread it monitors.
+ */
+ if (state == PFM_CTX_ZOMBIE)
+ return -EINVAL;
+
+ /*
+ * at this point, state is PFM_CTX_LOADED
+ */
+
+ /*
+ * some commands require the context to be unloaded to operate
+ */
+ if (check_mask & PFM_CMD_UNLOADED) {
+ PFM_DBG("state=%d, cmd needs context unloaded", state);
+ return -EBUSY;
+ }
+
+ /*
+ * self-monitoring always ok.
+ */
+ if (task == current)
+ return 0;
+
+ /*
+ * at this point, monitoring another thread
+ */
+
+ /*
+ * When we operate on another thread, we must wait for it to be
+ * stopped and completely off any CPU as we need to access the
+ * PMU state (or machine state).
+ *
+ * A thread can be put in the STOPPED state in various ways
+ * including PTRACE_ATTACH, or when it receives a SIGSTOP signal.
+ * We enforce that the thread must be ptraced, so it is stopped
+ * AND it CANNOT wake up while we operate on it because this
+ * would require an action from the ptracing parent which is the
+ * thread that is calling this function.
+ *
+	 * The dependency on ptrace imposes that only the ptracing
+	 * parent can issue commands on a thread. This is unfortunate
+ * but we do not know of a better way of doing this.
+ */
+ if (check_mask & PFM_CMD_STOPPED) {
+
+ spin_unlock_irqrestore(&ctx->lock, local_flags);
+
+ /*
+ * check that the thread is ptraced AND STOPPED
+ */
+ ret = ptrace_check_attach(task, 0);
+
+ spin_lock_irqsave(&ctx->lock, new_flags);
+
+ /*
+ * flags may be different than when we released the lock
+ */
+ *flags = new_flags;
+
+ if (ret)
+ return ret;
+ /*
+ * we must recheck to verify if state has changed
+ */
+ if (unlikely(ctx->state != state)) {
+ PFM_DBG("old_state=%d new_state=%d",
+ state,
+ ctx->state);
+ goto recheck;
+ }
+ }
+ return 0;
+}
+
+int pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+ unsigned long *flags)
+{
+ int ret;
+ ret = __pfm_check_task_state(ctx, check_mask, flags);
+	PFM_DBG("ret=%d", ret);
+ return ret;
+}
+
+/**
+ * pfm_get_args - Function used to copy the syscall argument into kernel memory
+ * @ureq: user argument
+ * @sz: user argument size
+ * @lsz: size of stack buffer
+ * @laddr: stack buffer address
+ * @req: points to the start of the kernel copy of the argument
+ * @ptr_free: address of kernel copy to free
+ *
+ * There are two options:
+ * - use a stack buffer described by laddr (addresses) and lsz (size)
+ * - allocate memory
+ *
+ * return:
+ * < 0 : in case of error (ptr_free may not be updated)
+ * 0 : success
+ * - req: points to base of kernel copy of arguments
+ * - ptr_free: address of buffer to free by caller on exit.
+ * NULL if using the stack buffer
+ *
+ * when ptr_free is not NULL upon return, the caller must kfree()
+ */
+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
+ void **req, void **ptr_free)
+{
+ void *addr;
+
+ /*
+	 * check sysadmin argument limit
+ */
+ if (unlikely(sz > pfm_controls.arg_mem_max)) {
+ PFM_DBG("argument too big %zu max=%zu",
+ sz,
+ pfm_controls.arg_mem_max);
+ return -E2BIG;
+ }
+
+ /*
+ * check if vector fits on stack buffer
+ */
+ if (sz > lsz) {
+ addr = kmalloc(sz, GFP_KERNEL);
+ if (unlikely(addr == NULL))
+ return -ENOMEM;
+ *ptr_free = addr;
+ } else {
+ addr = laddr;
+ *req = laddr;
+ *ptr_free = NULL;
+ }
+
+ /*
+ * bring the data in
+ */
+ if (unlikely(copy_from_user(addr, ureq, sz))) {
+ if (addr != laddr)
+ kfree(addr);
+ return -EFAULT;
+ }
+
+ /*
+ * base address of kernel buffer
+ */
+ *req = addr;
+
+ return 0;
+}
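+
+/*
+ * Usage sketch (illustrative): callers reserve a small on-stack buffer and
+ * let pfm_get_args() decide whether it is big enough, e.g.:
+ *
+ *	u64 buf[PFM_STK_ARG];
+ *	void *req, *fptr;
+ *
+ *	ret = pfm_get_args(ureq, sz, sizeof(buf), buf, &req, &fptr);
+ *	...
+ *	kfree(fptr);	(kfree(NULL) is a no-op when the stack buffer was used)
+ *
+ * This mirrors what sys_pfm_write() and sys_pfm_read() do below.
+ */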
+
+/**
+ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor
+ * @fd: file descriptor
+ * @ctx: pointer to pointer of context updated on return
+ * @cookie: opaque structure to use for release
+ *
+ * This helper function extracts the ctx from the file descriptor.
+ * It also increments the refcount of the file structure. Thus
+ * it updates the cookie so the refcount can be decreased when
+ * leaving the perfmon syscall via pfm_release_ctx_from_fd().
+ */
+static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx,
+ struct pfm_syscall_cookie *cookie)
+{
+ struct file *filp;
+ int fput_needed;
+
+ filp = fget_light(fd, &fput_needed);
+ if (unlikely(filp == NULL)) {
+ PFM_DBG("invalid fd %d", fd);
+ return -EBADF;
+ }
+
+ *ctx = filp->private_data;
+
+	if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) {
+		PFM_DBG("fd %d not related to perfmon", fd);
+		fput_light(filp, fput_needed);
+		return -EBADF;
+	}
+ cookie->filp = filp;
+ cookie->fput_needed = fput_needed;
+
+ return 0;
+}
+
+/**
+ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context
+ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd
+ */
+static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie)
+{
+ fput_light(cookie->filp, cookie->fput_needed);
+}
+
+/**
+ * pfm_validate_type_sz -- validate sz based on type
+ * @type : PFM_RW_XX type passed to pfm_write or pfm_read
+ * @sz : vector size in bytes
+ *
+ * return:
+ * the number of elements in the vector, 0 if error
+ */
+static size_t pfm_validate_type_sz(int type, size_t sz)
+{
+ size_t count, sz_type;
+
+	switch (type) {
+ case PFM_RW_PMD:
+ case PFM_RW_PMC:
+ sz_type = sizeof(struct pfarg_pmr);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ return 0;
+ }
+
+ count = sz / sz_type;
+
+ if ((count * sz_type) != sz) {
+ PFM_DBG("invalid size=%zu for type=%d", sz, type);
+ return 0;
+ }
+
+ PFM_DBG("sz=%zu sz_type=%zu count=%zu",
+ sz,
+ sz_type,
+ count);
+
+ return count;
+}
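+
+/*
+ * Example (illustrative): for type PFM_RW_PMD and sz = 3 *
+ * sizeof(struct pfarg_pmr), the function returns 3. If sz is not an exact
+ * multiple of sizeof(struct pfarg_pmr), it returns 0 and the caller
+ * rejects the request with -EINVAL.
+ */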
+
+/*
+ * unlike the other perfmon system calls, this one returns a file descriptor
+ * or a value < 0 in case of error, very much like open() or socket()
+ */
+asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo __user *ureq)
+{
+ struct pfm_context *new_ctx;
+ struct pfarg_sinfo sif;
+ int ret;
+
+ PFM_DBG("flags=0x%x sif=%p", flags, ureq);
+
+ if (perfmon_disabled)
+ return -ENOSYS;
+
+ if (flags) {
+ PFM_DBG("no flags accepted yet");
+ return -EINVAL;
+ }
+	ret = __pfm_create_context(flags, &sif, &new_ctx);
+	if (ret < 0)
+		return ret;
+
+ /*
+ * copy sif to user level argument, if requested
+ */
+ if (ureq && copy_to_user(ureq, &sif, sizeof(sif))) {
+ pfm_undo_create(ret, new_ctx);
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+asmlinkage long sys_pfm_write(int fd, int uflags,
+ int type,
+ void __user *ureq,
+ size_t sz)
+{
+ u64 buf[PFM_STK_ARG];
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ void *req, *fptr;
+ unsigned long flags;
+ size_t count;
+ int ret;
+
+ PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+ fd, uflags, type, ureq, sz);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+ count = pfm_validate_type_sz(type, sz);
+ if (!count)
+ return -EINVAL;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+ if (ret)
+ goto error;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (ret)
+ goto skip;
+	switch (type) {
+ case PFM_RW_PMC:
+ ret = __pfm_write_pmcs(ctx, req, count);
+ break;
+ case PFM_RW_PMD:
+ ret = __pfm_write_pmds(ctx, req, count);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ ret = -EINVAL;
+ }
+skip:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+ * This function may be on the critical path.
+	 * We want to avoid the branch if unnecessary.
+ */
+ if (fptr)
+ kfree(fptr);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
+
+asmlinkage long sys_pfm_read(int fd, int uflags,
+ int type,
+ void __user *ureq,
+ size_t sz)
+{
+ u64 buf[PFM_STK_ARG];
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ void *req, *fptr;
+ unsigned long flags;
+ size_t count;
+ int ret;
+
+ PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+ fd, uflags, type, ureq, sz);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+ count = pfm_validate_type_sz(type, sz);
+ if (!count)
+ return -EINVAL;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+ if (ret)
+ goto error;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (ret)
+ goto skip;
+
+	switch (type) {
+ case PFM_RW_PMD:
+ ret = __pfm_read_pmds(ctx, req, count);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ ret = -EINVAL;
+ }
+skip:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (copy_to_user(ureq, req, sz))
+ ret = -EFAULT;
+
+ if (fptr)
+ kfree(fptr);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
+
+asmlinkage long sys_pfm_set_state(int fd, int uflags, int state)
+{
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ PFM_DBG("fd=%d uflags=0x%x state=0x%x", fd, uflags, state);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+	switch (state) {
+ case PFM_ST_START:
+ case PFM_ST_STOP:
+ break;
+ default:
+ PFM_DBG("invalid state=0x%x", state);
+ return -EINVAL;
+ }
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (!ret) {
+ if (state == PFM_ST_STOP)
+ ret = __pfm_stop(ctx);
+ else
+ ret = __pfm_start(ctx);
+ }
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ pfm_release_ctx_from_fd(&cookie);
+
+ return ret;
+}
+
+static long pfm_detach(int fd, int uflags)
+{
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, &flags);
+ if (!ret)
+ ret = __pfm_unload_context(ctx);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+	 * if the unload was successful, then release the session.
+	 * This must be called with interrupts enabled, thus we need
+	 * to defer it until we are out of __pfm_unload_context()
+ */
+ if (!ret)
+ pfm_session_release();
+
+ pfm_release_ctx_from_fd(&cookie);
+
+ return ret;
+}
+
+asmlinkage long sys_pfm_attach(int fd, int uflags, int target)
+{
+ struct pfm_context *ctx;
+ struct task_struct *task;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ PFM_DBG("fd=%d uflags=0x%x target=%d", fd, uflags, target);
+
+ if (uflags) {
+ PFM_DBG("invalid flags");
+ return -EINVAL;
+ }
+
+ /*
+ * handle detach in a separate function
+ */
+ if (target == PFM_NO_TARGET)
+ return pfm_detach(fd, uflags);
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ task = current;
+
+ /*
+ * in per-thread mode (not self-monitoring), get a reference
+	 * on the task to monitor. This must be done with interrupts enabled.
+	 * Upon successful return, the refcount on the task has increased.
+ *
+ * fget_light() is protecting the context.
+ */
+ if (target != current->pid) {
+ ret = pfm_get_task(ctx, target, &task);
+ if (ret)
+ goto error;
+ }
+
+ /*
+ * irqsave is required to avoid race in case context is already
+ * loaded or with switch timeout in the case of self-monitoring
+ */
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags);
+ if (!ret)
+ ret = __pfm_load_context(ctx, task);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+ * in per-thread mode (not self-monitoring), we need
+ * to decrease refcount on task to monitor:
+ * - attach successful: we have a reference in ctx->task
+ * - attach failed : undo the effect of pfm_get_task()
+ */
+ if (task != current)
+ put_task_struct(task);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
diff --git a/perfmon/perfmon_sysfs.c b/perfmon/perfmon_sysfs.c
new file mode 100644
index 000000000000..b13c12581175
--- /dev/null
+++ b/perfmon/perfmon_sysfs.c
@@ -0,0 +1,344 @@
+/*
+ * perfmon_sysfs.c: perfmon2 sysfs interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+struct pfm_attribute {
+ struct attribute attr;
+ ssize_t (*show)(void *, struct pfm_attribute *attr, char *);
+ ssize_t (*store)(void *, const char *, size_t);
+};
+#define to_attr(n) container_of(n, struct pfm_attribute, attr)
+
+
+#define PFM_RO_ATTR(_name, _show) \
+ struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define PFM_RW_ATTR(_name, _show, _store) \
+ struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store)
+
+#define PFM_ROS_ATTR(_name, _show) \
+ struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define is_attr_name(a, n) (!strcmp((a)->attr.name, n))
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
+
+static struct kobject *pfm_kernel_kobj;
+static struct kobject *pfm_pmu_kobj;
+
+
+static ssize_t pfm_regs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj)
+ struct pfm_regmap_desc *reg = to_reg(kobj);
+ struct pfm_attribute *attribute = to_attr(attr);
+ return attribute->show ? attribute->show(reg, attribute, buf) : -EIO;
+}
+
+static struct sysfs_ops pfm_regs_sysfs_ops = {
+ .show = pfm_regs_attr_show
+};
+
+static struct kobj_type pfm_regs_ktype = {
+ .sysfs_ops = &pfm_regs_sysfs_ops,
+};
+
+static ssize_t pfm_controls_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+
+ if (is_attr_name(attr, "version"))
+ return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+ if (is_attr_name(attr, "task_sessions_count"))
+ return pfm_sysfs_res_show(buf, PAGE_SIZE, 0);
+
+ if (is_attr_name(attr, "debug"))
+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug);
+
+ if (is_attr_name(attr, "task_group"))
+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group);
+
+ if (is_attr_name(attr, "arg_mem_max"))
+ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max);
+
+ return 0;
+}
+
+static ssize_t pfm_controls_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ size_t d;
+
+ if (sscanf(buf, "%zu", &d) != 1)
+ goto skip;
+
+ if (is_attr_name(attr, "debug"))
+ pfm_controls.debug = d;
+
+ if (is_attr_name(attr, "task_group"))
+ pfm_controls.task_group = d;
+
+ if (is_attr_name(attr, "arg_mem_max")) {
+ /*
+ * we impose a page as the minimum.
+ *
+ * This limit may be smaller than the stack buffer
+ * available and that is fine.
+ */
+ if (d >= PAGE_SIZE)
+ pfm_controls.arg_mem_max = d;
+ }
+
+skip:
+ return count;
+}
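+
+/*
+ * Example (illustrative): writing "1" to /sys/kernel/perfmon/debug sets
+ * pfm_controls.debug to 1. Values written to arg_mem_max are only
+ * accepted when they are at least PAGE_SIZE.
+ */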
+
+/*
+ * /sys/kernel/perfmon attributes
+ */
+static PFM_RO_ATTR(version, pfm_controls_show);
+static PFM_RO_ATTR(task_sessions_count, pfm_controls_show);
+static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store);
+
+static struct attribute *pfm_kernel_attrs[] = {
+ &attr_version.attr,
+ &attr_task_sessions_count.attr,
+ &attr_debug.attr,
+ &attr_task_group.attr,
+ &attr_arg_mem_max.attr,
+ NULL
+};
+
+static struct attribute_group pfm_kernel_attr_group = {
+ .attrs = pfm_kernel_attrs,
+};
+
+/*
+ * per-reg attributes
+ */
+static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf)
+{
+ struct pfm_regmap_desc *reg = data;
+ int w;
+
+ if (is_attr_name(attr, "name"))
+ return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc);
+
+ if (is_attr_name(attr, "dfl_val"))
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long)reg->dfl_val);
+
+ if (is_attr_name(attr, "width")) {
+ w = (reg->type & PFM_REG_C64) ?
+ pfm_pmu_conf->counter_width : 64;
+ return snprintf(buf, PAGE_SIZE, "%d\n", w);
+ }
+
+ if (is_attr_name(attr, "rsvd_msk"))
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long)reg->rsvd_msk);
+
+ if (is_attr_name(attr, "addr"))
+ return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr);
+
+ return 0;
+}
+
+static PFM_ROS_ATTR(name, pfm_reg_show);
+static PFM_ROS_ATTR(dfl_val, pfm_reg_show);
+static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show);
+static PFM_ROS_ATTR(width, pfm_reg_show);
+static PFM_ROS_ATTR(addr, pfm_reg_show);
+
+static struct attribute *pfm_reg_attrs[] = {
+ &attr_name.attr,
+ &attr_dfl_val.attr,
+ &attr_rsvd_msk.attr,
+ &attr_width.attr,
+ &attr_addr.attr,
+ NULL
+};
+
+static struct attribute_group pfm_reg_attr_group = {
+ .attrs = pfm_reg_attrs,
+};
+
+static ssize_t pfm_pmu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ if (is_attr_name(attr, "model"))
+ return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name);
+ return 0;
+}
+
+static PFM_RO_ATTR(model, pfm_pmu_show);
+
+static struct attribute *pfm_pmu_desc_attrs[] = {
+ &attr_model.attr,
+ NULL
+};
+
+static struct attribute_group pfm_pmu_desc_attr_group = {
+ .attrs = pfm_pmu_desc_attrs,
+};
+
+static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu)
+{
+ struct pfm_regmap_desc *reg;
+ unsigned int i, k;
+ int ret;
+
+ reg = pmu->pmc_desc;
+ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) {
+
+ if (!(reg->type & PFM_REG_I))
+ continue;
+
+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+ pfm_pmu_kobj, "pmc%u", i);
+ if (ret)
+ goto undo_pmcs;
+
+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+ if (ret) {
+ kobject_del(&reg->kobj);
+ goto undo_pmcs;
+ }
+ }
+
+ reg = pmu->pmd_desc;
+ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) {
+
+ if (!(reg->type & PFM_REG_I))
+ continue;
+
+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+ pfm_pmu_kobj, "pmd%u", i);
+ if (ret)
+ goto undo_pmds;
+
+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+ if (ret) {
+ kobject_del(&reg->kobj);
+ goto undo_pmds;
+ }
+ }
+ return 0;
+undo_pmds:
+ reg = pmu->pmd_desc;
+ for (k = 0; k < i; k++, reg++) {
+ if (!(reg->type & PFM_REG_I))
+ continue;
+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+ kobject_del(&reg->kobj);
+ }
+ i = pmu->num_pmc_entries;
+ /* fall through */
+undo_pmcs:
+ reg = pmu->pmc_desc;
+ for (k = 0; k < i; k++, reg++) {
+ if (!(reg->type & PFM_REG_I))
+ continue;
+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+ kobject_del(&reg->kobj);
+ }
+ return ret;
+}
+
+/*
+ * when a PMU description module is inserted, we create
+ * a pmu_desc subdir in sysfs and we populate it with
+ * PMU specific information, such as register mappings
+ */
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu)
+{
+ int ret;
+
+ pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj);
+ if (!pfm_pmu_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+ if (ret) {
+ /* will release pfm_pmu_kobj */
+ kobject_put(pfm_pmu_kobj);
+ return ret;
+ }
+
+ ret = pfm_sysfs_add_pmu_regs(pmu);
+ if (ret) {
+ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+ /* will release pfm_pmu_kobj */
+ kobject_put(pfm_pmu_kobj);
+	} else {
+		kobject_uevent(pfm_pmu_kobj, KOBJ_ADD);
+	}
+
+ return ret;
+}
+
+int __init pfm_init_sysfs(void)
+{
+ int ret;
+
+ /*
+ * dynamic allocation happens on pfm_kernel_kobj,
+ * but a release callback is attached
+ */
+ pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj);
+ if (!pfm_kernel_kobj) {
+ PFM_ERR("cannot add kernel object");
+ return -ENOMEM;
+ }
+
+ ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group);
+ if (ret) {
+ kobject_put(pfm_kernel_kobj);
+ return ret;
+ }
+
+ if (pfm_pmu_conf)
+ pfm_sysfs_add_pmu(pfm_pmu_conf);
+
+ return 0;
+}
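+
+/*
+ * Resulting layout (sketch, derived from pfm_init_sysfs() and
+ * pfm_sysfs_add_pmu_regs() above): with a PMU description loaded, the
+ * hierarchy looks roughly like:
+ *
+ *	/sys/kernel/perfmon/			<- pfm_kernel_kobj, control attributes
+ *	/sys/kernel/perfmon/pmu_desc/		<- pfm_pmu_kobj, "model" attribute
+ *	/sys/kernel/perfmon/pmu_desc/pmcN/	<- name, dfl_val, rsvd_msk, width, addr
+ *	/sys/kernel/perfmon/pmu_desc/pmdN/	<- same per-register attributes
+ *
+ * N stands for the register index of each implemented PMC/PMD.
+ */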