author    Stephen Rothwell <sfr@canb.auug.org.au>  2008-11-11 18:00:33 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>  2008-11-11 18:00:33 +1100
commit    6bfea1858872c1e57d94d686e3144bfa10ca48cb (patch)
tree      cfe7ad66c1f9a14f9d419c3ebf3100264b0044d4
parent    bc6435afdc6a1e0c5236a4a031f372bc1c62341d (diff)
parent    4872c7055867a9b583c76cd7744030dd515a5f35 (diff)
Merge commit 'perfmon3/master'
-rw-r--r-- Documentation/ABI/testing/sysfs-perfmon | 42
-rw-r--r-- Documentation/ABI/testing/sysfs-perfmon-pmu | 48
-rw-r--r-- Documentation/perfmon.txt | 206
-rw-r--r-- Makefile | 2
-rw-r--r-- arch/ia64/Kconfig | 2
-rw-r--r-- arch/ia64/configs/bigsur_defconfig | 2
-rw-r--r-- arch/ia64/configs/generic_defconfig | 2
-rw-r--r-- arch/ia64/configs/gensparse_defconfig | 2
-rw-r--r-- arch/ia64/configs/sim_defconfig | 2
-rw-r--r-- arch/ia64/configs/tiger_defconfig | 2
-rw-r--r-- arch/ia64/configs/zx1_defconfig | 2
-rw-r--r-- arch/ia64/include/asm/processor.h | 2
-rw-r--r-- arch/ia64/include/asm/system.h | 2
-rw-r--r-- arch/ia64/kernel/Makefile | 2
-rw-r--r-- arch/ia64/kernel/irq_ia64.c | 4
-rw-r--r-- arch/ia64/kernel/perfmon.c | 6
-rw-r--r-- arch/ia64/kernel/process.c | 16
-rw-r--r-- arch/ia64/kernel/ptrace.c | 4
-rw-r--r-- arch/ia64/kernel/smpboot.c | 4
-rw-r--r-- arch/ia64/lib/Makefile | 2
-rw-r--r-- arch/ia64/oprofile/Makefile | 2
-rw-r--r-- arch/ia64/oprofile/init.c | 4
-rw-r--r-- arch/x86/Kconfig | 2
-rw-r--r-- arch/x86/Makefile | 3
-rw-r--r-- arch/x86/ia32/ia32entry.S | 5
-rw-r--r-- arch/x86/include/asm/Kbuild | 1
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r-- arch/x86/include/asm/mach-default/entry_arch.h | 4
-rw-r--r-- arch/x86/include/asm/perfmon.h | 34
-rw-r--r-- arch/x86/include/asm/perfmon_kern.h | 438
-rw-r--r-- arch/x86/include/asm/thread_info.h | 8
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 5
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 11
-rw-r--r-- arch/x86/kernel/entry_32.S | 2
-rw-r--r-- arch/x86/kernel/entry_64.S | 8
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 5
-rw-r--r-- arch/x86/kernel/process_32.c | 10
-rw-r--r-- arch/x86/kernel/process_64.c | 10
-rw-r--r-- arch/x86/kernel/signal_32.c | 5
-rw-r--r-- arch/x86/kernel/signal_64.c | 5
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 5
-rw-r--r-- arch/x86/oprofile/nmi_int.c | 10
-rw-r--r-- arch/x86/perfmon/Kconfig | 33
-rw-r--r-- arch/x86/perfmon/Makefile | 7
-rw-r--r-- arch/x86/perfmon/perfmon.c | 619
-rw-r--r-- arch/x86/perfmon/perfmon_amd64.c | 483
-rw-r--r-- arch/x86/perfmon/perfmon_intel_arch.c | 628
-rw-r--r-- include/linux/perfmon.h | 102
-rw-r--r-- include/linux/perfmon_kern.h | 285
-rw-r--r-- include/linux/perfmon_pmu.h | 138
-rw-r--r-- include/linux/sched.h | 4
-rw-r--r-- include/linux/syscalls.h | 11
-rw-r--r-- kernel/sys_ni.c | 7
-rw-r--r-- perfmon/Makefile | 10
-rw-r--r-- perfmon/perfmon_activate.c | 136
-rw-r--r-- perfmon/perfmon_attach.c | 337
-rw-r--r-- perfmon/perfmon_ctx.c | 400
-rw-r--r-- perfmon/perfmon_ctxsw.c | 252
-rw-r--r-- perfmon/perfmon_file.c | 306
-rw-r--r-- perfmon/perfmon_init.c | 87
-rw-r--r-- perfmon/perfmon_intr.c | 295
-rw-r--r-- perfmon/perfmon_pmu.c | 269
-rw-r--r-- perfmon/perfmon_priv.h | 131
-rw-r--r-- perfmon/perfmon_res.c | 223
-rw-r--r-- perfmon/perfmon_rw.c | 449
-rw-r--r-- perfmon/perfmon_syscalls.c | 741
-rw-r--r-- perfmon/perfmon_sysfs.c | 344
67 files changed, 7196 insertions, 37 deletions
diff --git a/Documentation/ABI/testing/sysfs-perfmon b/Documentation/ABI/testing/sysfs-perfmon
new file mode 100644
index 000000000000..79c66b59ec5b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon
@@ -0,0 +1,42 @@
+What: /sys/kernel/perfmon
+Date: Oct 2008
+KernelVersion: 2.6.27
+Contact: eranian@gmail.com
+
+Description: Provides the configuration interface for the perfmon subsystem.
+ The tree contains information about the detected hardware, the
+ current state of the subsystem, as well as some configuration
+ parameters.
+
+ The tree consists of the following entries:
+
+ /sys/kernel/perfmon/debug (read-write):
+
+ Enable perfmon debugging output. The traces are rate-limited
+ to avoid flooding the console. It is possible to change the
+ throttling via /proc/sys/kernel/printk_ratelimit.
+
+ The value is interpreted as a bitmask. Each bit enables a
+ particular type of debug messages. Refer to the file
+ include/linux/perfmon_kern.h for more information.
+
+ /sys/kernel/perfmon/task_group (read-write):
+
+ User group allowed to create a per-thread context (session).
+ -1 means any group.
+
+ /sys/kernel/perfmon/task_sessions_count (read-only):
+
+ Number of per-thread contexts (sessions) currently attached
+ to threads.
+
+ /sys/kernel/perfmon/version (read-only):
+
+ Perfmon interface revision number.
+
+ /sys/kernel/perfmon/arg_mem_max (read-write):
+
+ Maximum size of vector arguments expressed in bytes.
+ It can be modified but must be at least a page.
+ Default: PAGE_SIZE
+
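+ As an illustration, a minimal user-space sketch that reads the
+ interface revision and enables one class of debug messages
+ (assuming the entries above exist and the caller has the required
+ privileges; the meaning of each debug bit is defined in
+ include/linux/perfmon_kern.h):
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+         char buf[32];
+         FILE *f;
+
+         /* read the perfmon interface revision */
+         f = fopen("/sys/kernel/perfmon/version", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("perfmon version: %s", buf);
+         if (f)
+             fclose(f);
+
+         /* enable the first class of debug messages (bit 0) */
+         f = fopen("/sys/kernel/perfmon/debug", "w");
+         if (f) {
+             fprintf(f, "1\n");
+             fclose(f);
+         }
+         return 0;
+     }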
diff --git a/Documentation/ABI/testing/sysfs-perfmon-pmu b/Documentation/ABI/testing/sysfs-perfmon-pmu
new file mode 100644
index 000000000000..2fa5a7ca8e8b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon-pmu
@@ -0,0 +1,48 @@
+What: /sys/kernel/perfmon/pmu
+Date: Nov 2007
+KernelVersion: 2.6.24
+Contact: eranian@gmail.com
+
+Description: Provides information about the active PMU description
+ module. The module contains the mapping of the actual
+ performance counter registers onto the logical PMU exposed by
+ perfmon. There is at most one PMU description module loaded
+ at any time.
+
+ The sysfs PMU tree provides a description of the mapping for
+ each register. There is one subdir per config and data register,
+ along with an entry for the name of the PMU model.
+
+ The entries are as follows:
+
+ /sys/kernel/perfmon/pmu_desc/model (read-only):
+
+ Name of the PMU model, in clear text, zero-terminated.
+
+ Then, each logical PMU register, XX, gets a subtree with the
+ following entries:
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only):
+
+ The physical address or index of the actual underlying hardware
+ register. On Itanium, it corresponds to the register index; on
+ X86 processors, it is the actual MSR address.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only):
+
+ The default value of the register in hexadecimal.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only):
+
+ The name of the hardware register.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only):
+
+ Bitmask of reserved bits, i.e., bits which cannot be changed
+ by applications. When a bit is set, it means the corresponding
+ bit in the actual register is reserved.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only):
+
+ The width in bits of the register. This field is only
+ relevant for counter registers.
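+
+ As an illustration only, the mapping can be inspected from user
+ space once a PMU description module is loaded; the register subdir
+ name used below (pmc0) is just an example of the pm*XX entries
+ described above:
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+         char buf[64];
+         FILE *f;
+
+         /* name of the active PMU model */
+         f = fopen("/sys/kernel/perfmon/pmu_desc/model", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("PMU model: %s", buf);
+         if (f)
+             fclose(f);
+
+         /* hardware address (MSR) or index of one config register */
+         f = fopen("/sys/kernel/perfmon/pmu_desc/pmc0/addr", "r");
+         if (f && fgets(buf, sizeof(buf), f))
+             printf("pmc0 addr: %s", buf);
+         if (f)
+             fclose(f);
+         return 0;
+     }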
diff --git a/Documentation/perfmon.txt b/Documentation/perfmon.txt
new file mode 100644
index 000000000000..818c53770e8b
--- /dev/null
+++ b/Documentation/perfmon.txt
@@ -0,0 +1,206 @@
+ The perfmon hardware monitoring interface
+ ------------------------------------------
+ Stephane Eranian
+ <eranian@gmail.com>
+
+I/ Introduction
+
+ The perfmon interface provides access to the hardware performance counters
+ of major processors. Nowadays, all processors implement some flavor of
+ performance counters which capture micro-architectural level information
+ such as the number of elapsed cycles, number of cache misses, and so on.
+
+ The interface is implemented as a set of new system calls and a set of
+ config files in /sys.
+
+ It is possible to monitor a single thread or a CPU. In either mode,
+ applications can count or sample. System-wide monitoring is supported by
+ running a monitoring session on each CPU. The interface supports event-based
+ sampling, where the sampling period is expressed as the number of occurrences
+ of an event instead of just a timeout. This approach provides better
+ granularity and flexibility.
+
+ For performance reasons, it is possible to use a kernel-level sampling buffer
+ to minimize the overhead incurred by sampling. The format of the buffer,
+ what is recorded, how it is recorded, and how it is exported to user space
+ are controlled by a kernel module called a sampling format. The current
+ implementation comes with a default format, but it is possible to create
+ additional formats. There is a kernel registration interface for formats.
+ Each format is identified by a simple string which a tool can pass when a
+ monitoring session is created.
+
+ The interface also provides support for event set and multiplexing to work
+ around hardware limitations in the number of available counters or in how
+ events can be combined. Each set defines as many counters as the hardware
+ can support. The kernel then multiplexes the sets. The interface supports
+ time-based switching but also overflow-based switching, i.e., after n
+ overflows of designated counters.
+
+ Applications never manipulate the actual performance counter registers.
+ Instead, they see a logical Performance Monitoring Unit (PMU) composed of a
+ set of config registers (PMC) and a set of data registers (PMD). Note that
+ PMDs are not necessarily counters; they can be buffers. The logical PMU is
+ then mapped onto the actual PMU using a mapping table which is implemented
+ as a kernel module. The mapping is chosen once for each new processor. It is
+ visible in /sys/kernel/perfmon/pmu_desc. The kernel module is automatically
+ loaded on first use.
+
+ A monitoring session is uniquely identified by a file descriptor obtained
+ when the session is created. File-sharing semantics apply when accessing the
+ session inside a process. A session is never inherited across fork. The file
+ descriptor can be used to receive notifications of counter overflows or of a
+ full sampling buffer. It is possible to use poll/select on the descriptor
+ to wait for notifications from multiple sessions. Similarly, the descriptor
+ supports asynchronous notifications via SIGIO.
+
+ Counters are always exported as being 64-bit wide regardless of what the
+ underlying hardware implements.
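+
+ For example, a tool holding several session descriptors can wait for
+ notifications with poll(); this is only a sketch, the layout of the
+ notification message itself is defined by the perfmon header files and
+ is not shown here:
+
+     #include <poll.h>
+
+     /* wait up to 1s for an overflow or buffer-full notification */
+     int wait_for_notification(int fd1, int fd2)
+     {
+         struct pollfd fds[2] = {
+             { .fd = fd1, .events = POLLIN },
+             { .fd = fd2, .events = POLLIN },
+         };
+         int n = poll(fds, 2, 1000);
+
+         if (n <= 0)
+             return -1;    /* timeout or error */
+         /* the readable descriptor carries the pending notification */
+         return (fds[0].revents & POLLIN) ? fd1 : fd2;
+     }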
+
+II/ Kernel compilation
+
+ To enable perfmon, you need to enable CONFIG_PERFMON and also some of the
+ model-specific PMU modules.
+
+III/ OProfile interactions
+
+ The set of features offered by perfmon is rich enough to support migrating
+ Oprofile on top of it. That means that PMU programming and low-level
+ interrupt handling could be done by perfmon. The Oprofile sampling buffer
+ management code in the kernel as well as how samples are exported to users
+ could remain through the use of a sampling format. This is how Oprofile
+ works on Itanium.
+
+ The current interactions with Oprofile are:
+ - on X86: Both subsystems can be compiled into the same kernel. There
+ is enforced mutual exclusion between the two subsystems. When
+ there is an Oprofile session, no perfmon session can exist
+ and vice-versa.
+
+ - On IA-64: Oprofile works on top of perfmon. Oprofile being a
+ system-wide monitoring tool, the regular per-thread vs.
+ system-wide session restrictions apply.
+
+ - on PPC: no integration yet. Only one subsystem can be enabled.
+ - on MIPS: no integration yet. Only one subsystem can be enabled.
+
+IV/ User tools
+
+ We have released a simple monitoring tool to demonstrate the features of
+ the interface. The tool is called pfmon and it comes with a simple helper
+ library called libpfm. The library comes with a set of examples to show
+ how to use the kernel interface. Visit http://perfmon2.sf.net for details.
+
+ There may be other tools available for perfmon.
+
+V/ How to program?
+
+ The best way to learn how to program perfmon is to take a look at the
+ source code for the examples in libpfm. The source code is available from:
+
+ http://perfmon2.sf.net
+
+VI/ System calls overview
+
+ In this section, we describe the state of the interface as submitted to the
+ kernel. There are more extensions available, and we will update the section
+ as they get implemented in the upstream kernel.
+
+ The interface is implemented by the following system calls:
+
+ * int pfm_create(int flags, pfarg_sinfo_t *s);
+
+ This function creates a perfmon per-thread session.
+ The flags parameter is currently unused and must be set to 0.
+
+ Upon return, and if s is not NULL, the kernel returns the list of available
+ PMC and PMD registers. Tools should not assume they have access to the
+ entire PMU; it may be shared with other kernel subsystems, e.g., on X86
+ with the NMI watchdog timer.
+
+ The function returns the file descriptor identifying the session.
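+
+ A minimal sketch of creating a session, assuming the patched kernel
+ headers export __NR_pfm_create and pfarg_sinfo_t (the header location,
+ <linux/perfmon.h>, is an assumption here):
+
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int create_session(void)
+     {
+         pfarg_sinfo_t sif;
+
+         /* flags must be 0; sif describes the available PMC/PMD */
+         return syscall(__NR_pfm_create, 0, &sif);
+     }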
+
+ * int pfm_write(int fd, int flags, int type, void *d, size_t sz)
+
+ This function is used to write PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to write and determines the type
+ of the d parameter. The following types are defined:
+
+ - PFM_RW_PMC: write PMC registers, expects a pfarg_pmr_t pointer for d
+ - PFM_RW_PMD: write PMD registers, expects a pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
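+ A sketch of programming one config and one data register follows. The
+ pfarg_pmr_t field names used below (reg_num, reg_value) are assumptions;
+ the actual layout is defined in the perfmon header files:
+
+     #include <string.h>
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int program_counter(int fd, unsigned long long event_encoding)
+     {
+         pfarg_pmr_t pc, pd;
+
+         memset(&pc, 0, sizeof(pc));
+         memset(&pd, 0, sizeof(pd));
+         pc.reg_num = 0;              /* assumed field names */
+         pc.reg_value = event_encoding;
+         pd.reg_num = 0;              /* counter paired with PMC0 */
+         pd.reg_value = 0;
+
+         /* one type per call: first the PMC, then the PMD */
+         if (syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pc, sizeof(pc)))
+             return -1;
+         return syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+     }
+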
+ * int pfm_read(int fd, int flags, int type, void *d, size_t sz);
+
+ This function is used to read PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to read and determines the type
+ of the d parameter. The following types are supported:
+
+ - PFM_RW_PMD: read PMD registers, expects a pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ Reading of PMC registers is not allowed.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
+
+ * int pfm_attach(int fd, int flags, int target);
+
+ This function is used to attach the session to, and detach it from, a
+ thread.
+
+ To attach, the thread is identified by target, which must be the
+ value returned by gettid() (not pthread_self()). For a single-threaded
+ process, that value is equal to the value returned by getpid().
+
+ To detach, the special target PFM_NO_TARGET must be passed.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The session is always attached as stopped, i.e., with monitoring
+ inactive. Monitoring is always stopped as a consequence of detaching.
+
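+ For instance, attaching the session to the calling thread and detaching
+ it again could look like the sketch below, assuming PFM_NO_TARGET and
+ __NR_pfm_attach come from the perfmon headers:
+
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     /* attach to the calling thread: gettid(), not pthread_self() */
+     int attach_self(int fd)
+     {
+         pid_t tid = syscall(SYS_gettid);
+
+         return syscall(__NR_pfm_attach, fd, 0, tid);
+     }
+
+     /* detach: monitoring is stopped as a side effect */
+     int detach(int fd)
+     {
+         return syscall(__NR_pfm_attach, fd, 0, PFM_NO_TARGET);
+     }
+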
+ * int pfm_set_state(int fd, int flags, int state);
+
+ This function is used to set the running state of the session. The state
+ to switch to is indicated by state.
+
+ The following states are defined, only one can be specified at a time:
+
+ - PFM_ST_START: start monitoring
+ - PFM_ST_STOP: stop monitoring
+
+ The flags parameter is currently unused and must be set to 0.
+
+ * int close(int fd)
+
+ To destroy a session, the regular close() system call is used.
+
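+ Putting it all together, a self-monitoring count of a single event could
+ look like the sketch below. Register numbers, the event encoding and the
+ pfarg_* field names are placeholders; the real values come from the PMU
+ description (see /sys/kernel/perfmon/pmu_desc) and the perfmon headers:
+
+     #include <stdio.h>
+     #include <string.h>
+     #include <unistd.h>
+     #include <sys/syscall.h>
+     #include <linux/perfmon.h>
+
+     int main(void)
+     {
+         pfarg_sinfo_t sif;
+         pfarg_pmr_t pc, pd;
+         int fd;
+
+         fd = syscall(__NR_pfm_create, 0, &sif);
+         if (fd < 0)
+             return 1;
+
+         memset(&pc, 0, sizeof(pc));
+         memset(&pd, 0, sizeof(pd));
+         pc.reg_num = 0;      /* placeholder PMC */
+         pc.reg_value = 0;    /* placeholder event encoding */
+         pd.reg_num = 0;      /* counter paired with the PMC */
+         pd.reg_value = 0;
+
+         syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pc, sizeof(pc));
+         syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+
+         syscall(__NR_pfm_attach, fd, 0, (pid_t) syscall(SYS_gettid));
+         syscall(__NR_pfm_set_state, fd, 0, PFM_ST_START);
+
+         /* ... workload to measure ... */
+
+         syscall(__NR_pfm_set_state, fd, 0, PFM_ST_STOP);
+         syscall(__NR_pfm_read, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
+         printf("count: %llu\n", (unsigned long long) pd.reg_value);
+
+         close(fd);    /* destroys the session */
+         return 0;
+     }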
+
+VII/ /sys interface overview
+
+ Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed
+ description of the sysfs interface of perfmon2.
+
+VIII/ debugfs interface overview
+
+ Refer to Documentation/perfmon-debugfs.txt for a detailed description of the
+ debug and statistics interface of perfmon.
+
+IX/ Documentation
+
+ Visit http://perfmon2.sf.net
diff --git a/Makefile b/Makefile
index 7f9ff9bf1544..b14977d28eab 100644
--- a/Makefile
+++ b/Makefile
@@ -621,6 +621,8 @@ export mod_strip_cmd
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-$(CONFIG_PERFMON) += perfmon/
+
vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
$(net-y) $(net-m) $(libs-y) $(libs-m)))
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..ad604df6a2b6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT
config IA64_MCA_RECOVERY
tristate "MCA recovery from errors other than TLB."
-config PERFMON
+config PERFMON_V20
bool "Performance monitor support"
help
Selects whether support for the IA-64 performance monitor hardware
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
index 6dd8655664f3..2c04fbe6c414 100644
--- a/arch/ia64/configs/bigsur_defconfig
+++ b/arch/ia64/configs/bigsur_defconfig
@@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
#
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
index e05f9e1d3faa..7d89a19fc8b3 100644
--- a/arch/ia64/configs/generic_defconfig
+++ b/arch/ia64/configs/generic_defconfig
@@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
index e86fbd39c795..5f8c7721e29a 100644
--- a/arch/ia64/configs/gensparse_defconfig
+++ b/arch/ia64/configs/gensparse_defconfig
@@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
CONFIG_SGI_SN=y
diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig
index 546a772f438e..d51457af7ca6 100644
--- a/arch/ia64/configs/sim_defconfig
+++ b/arch/ia64/configs/sim_defconfig
@@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
# CONFIG_IA64_MCA_RECOVERY is not set
-# CONFIG_PERFMON is not set
+# CONFIG_PERFMON_V20 is not set
CONFIG_IA64_PALINFO=m
#
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index c522edf23c62..318d846ab253 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y
CONFIG_HOLES_IN_ZONE=y
# CONFIG_IA32_SUPPORT is not set
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_MC_ERR_INJECT is not set
# CONFIG_IA64_ESI is not set
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
index 0a06b1333c95..2bf0ad40398f 100644
--- a/arch/ia64/configs/zx1_defconfig
+++ b/arch/ia64/configs/zx1_defconfig
@@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y
CONFIG_IA32_SUPPORT=y
CONFIG_COMPAT=y
CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_PERFMON=y
+CONFIG_PERFMON_V20=y
CONFIG_IA64_PALINFO=y
# CONFIG_IA64_ESI is not set
# CONFIG_KEXEC is not set
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index f88fa054d01d..3ecf7e0b44cb 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -321,7 +321,7 @@ struct thread_struct {
#else
# define INIT_THREAD_IA32
#endif /* CONFIG_IA32_SUPPORT */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
void *pfm_context; /* pointer to detailed PMU context */
unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */
# define INIT_THREAD_PM .pfm_context = NULL, \
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 927a381c20ca..387e54030af1 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
# define IA64_ACCOUNT_ON_SWITCH(p,n)
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
DECLARE_PER_CPU(unsigned long, pfm_syst_info);
# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
#else
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index c381ea954892..93819cca7d96 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_NUMA) += numa.o
-obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
+obj-$(CONFIG_PERFMON_V20) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..db54bd497cf6 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -40,7 +40,7 @@
#include <asm/system.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -660,7 +660,7 @@ init_IRQ (void)
}
#endif
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
platform_irq_init();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309b..5f6efcfa2de4 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -52,7 +52,7 @@
#include <asm/uaccess.h>
#include <asm/delay.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* perfmon context state
*/
@@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
* the psr bits are already set properly in copy_threads()
*/
}
-#else /* !CONFIG_PERFMON */
+#else /* !CONFIG_PERFMON_V20 */
asmlinkage long
sys_perfmonctl (int fd, int cmd, void *arg, int count)
{
return -ENOSYS;
}
-#endif /* CONFIG_PERFMON */
+#endif /* CONFIG_PERFMON_V20 */
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index c57162705147..afbf1a8205ee 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -46,7 +46,7 @@
#include "entry.h"
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
# include <asm/perfmon.h>
#endif
@@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
return;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_needs_checking)
/*
* Note: pfm_handle_work() allow us to call it with interrupts
@@ -334,14 +334,14 @@ cpu_idle (void)
void
ia64_save_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_save_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_save_regs(task);
@@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task)
void
ia64_load_extra (struct task_struct *task)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
unsigned long info;
#endif
if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
ia64_load_debug_regs(&task->thread.dbr[0]);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
pfm_load_regs(task);
@@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags,
}
#endif
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
if (current->thread.pfm_context)
pfm_inherit(p, child_ptregs);
#endif
@@ -735,7 +735,7 @@ exit_thread (void)
{
ia64_drop_fpu(current);
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* if needed, stop monitoring and flush state to perfmon context */
if (current->thread.pfm_context)
pfm_exit_thread(current);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 92c9689b7d97..ffd212fd2d36 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -31,7 +31,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/unwind.h>
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
#include <asm/perfmon.h>
#endif
@@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr,
"address 0x%lx\n", addr);
return -1;
}
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/*
* Check if debug registers are used by perfmon. This
* test must be done once we know that we can do the
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 1dcbb85fc4ee..f865315a9248 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -381,7 +381,7 @@ smp_callin (void)
extern void ia64_init_itm(void);
extern volatile int time_keeper_id;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
extern void pfm_init_percpu(void);
#endif
@@ -411,7 +411,7 @@ smp_callin (void)
ia64_mca_cmc_vector_setup(); /* Setup vector on AP */
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
pfm_init_percpu();
#endif
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 98771e2a78af..754f4153123e 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
-lib-$(CONFIG_PERFMON) += carta_random.o
+lib-$(CONFIG_PERFMON_V20) += carta_random.o
AFLAGS___divdi3.o =
AFLAGS___udivdi3.o = -DUNSIGNED
diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile
index aad27a718ee0..3323fd5a46e9 100644
--- a/arch/ia64/oprofile/Makefile
+++ b/arch/ia64/oprofile/Makefile
@@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_PERFMON) += perfmon.o
+oprofile-$(CONFIG_PERFMON_V20) += perfmon.o
diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c
index 31b545c35460..9ed2bc152fba 100644
--- a/arch/ia64/oprofile/init.c
+++ b/arch/ia64/oprofile/init.c
@@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
{
int ret = -ENODEV;
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
/* perfmon_init() can fail, but we have no way to report it */
ret = perfmon_init(ops);
#endif
@@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
void oprofile_arch_exit(void)
{
-#ifdef CONFIG_PERFMON
+#ifdef CONFIG_PERFMON_V20
perfmon_exit();
#endif
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b5e714373385..cdc53491c033 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
+source "arch/x86/perfmon/Kconfig"
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cf72b569db41..f3af2b0b4f15 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
core-y += arch/x86/kernel/
core-y += arch/x86/mm/
+# perfmon support
+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
+
# Remaining sub architecture files
core-y += $(mcore-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892..891af3e6b3a6 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -826,4 +826,9 @@ ia32_sys_call_table:
.quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
+ .quad sys_pfm_create
+ .quad sys_pfm_write
+ .quad sys_pfm_read /* 335 */
+ .quad sys_pfm_attach
+ .quad sys_pfm_set_state
ia32_syscall_end:
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..15d495f73485 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
header-y += sigcontext32.h
header-y += ucontext.h
header-y += processor-flags.h
+header-y += perfmon.h
unifdef-y += e820.h
unifdef-y += ist.h
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..0ba6dd3aa24e 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -87,6 +87,11 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
+ * Perfmon PMU interrupt vector
+ */
+#define LOCAL_PERFMON_VECTOR 0xee
+
+/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
* start at 0x31(0x41) to spread out vectors evenly between priority
* levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..e940722dc1f0 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
#endif
+#ifdef CONFIG_PERFMON
+BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR)
+#endif
+
#endif
diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h
new file mode 100644
index 000000000000..906f4b24cf0c
--- /dev/null
+++ b/arch/x86/include/asm/perfmon.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This file contains i386/x86_64 specific definitions for the perfmon
+ * interface.
+ *
+ * This file MUST never be included directly. Use linux/perfmon.h.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_H_
+#define _ASM_X86_PERFMON_H_
+
+/*
+ * arch-specific user visible interface definitions
+ */
+
+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
+
+#endif /* _ASM_X86_PERFMON_H_ */
diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h
new file mode 100644
index 000000000000..7cadbb894e83
--- /dev/null
+++ b/arch/x86/include/asm/perfmon_kern.h
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This file contains X86 Processor Family specific definitions
+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef _ASM_X86_PERFMON_KERN_H_
+#define _ASM_X86_PERFMON_KERN_H_
+
+#ifdef CONFIG_PERFMON
+#include <linux/unistd.h>
+#ifdef CONFIG_4KSTACKS
+#define PFM_ARCH_STK_ARG 8
+#else
+#define PFM_ARCH_STK_ARG 16
+#endif
+
+struct pfm_arch_pmu_info {
+ u32 flags; /* PMU feature flags */
+ /*
+ * mandatory model-specific callbacks
+ */
+ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
+ int (*has_ovfls)(struct pfm_context *ctx);
+ void (*quiesce)(void);
+
+ /*
+ * optional model-specific callbacks
+ */
+ void (*acquire_pmu_percpu)(void);
+ void (*release_pmu_percpu)(void);
+ int (*load_context)(struct pfm_context *ctx);
+ void (*unload_context)(struct pfm_context *ctx);
+};
+
+/*
+ * PMU feature flags
+ */
+#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */
+#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */
+
+struct pfm_x86_ctx_flags {
+ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */
+ unsigned int reserved:31; /* for future use */
+};
+
+struct pfm_arch_context {
+ u64 saved_real_iip; /* instr pointer of last NMI intr */
+ struct pfm_x86_ctx_flags flags; /* flags */
+ int saved_started;
+};
+
+/*
+ * functions implemented as inline on x86
+ */
+
+/**
+ * pfm_arch_write_pmc - write a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ * @value: PMC 64-bit value
+ *
+ * in certain situations, ctx may be NULL
+ */
+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * we only write to the actual register when monitoring is
+ * active (pfm_start was issued)
+ */
+ if (ctx && ctx->flags.started == 0)
+ return;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_write_pmd - write a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ * @value: PMD 64-bit value
+ */
+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
+ unsigned int cnum, u64 value)
+{
+ /*
+ * to make sure the counter overflows, we set the
+ * upper bits. We also clear any other unimplemented
+ * bits as they may cause a crash on some processors.
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
+ value = (value | ~pfm_pmu_conf->ovfl_mask)
+ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) value);
+
+ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
+}
+
+/**
+ * pfm_arch_read_pmd - read a single PMD register
+ * @ctx: context to work on
+ * @cnum: PMD index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_read_pmc - read a single PMC register
+ * @ctx: context to work on
+ * @cnum: PMC index
+ *
+ * return value is register 64-bit value
+ */
+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 tmp;
+
+ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
+
+ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
+ (unsigned long long) tmp);
+ return tmp;
+}
+
+/**
+ * pfm_arch_is_active - return non-zero if monitoring has been started
+ * @ctx: context to check
+ *
+ * At certain points, perfmon needs to know if monitoring has been
+ * explicitly started.
+ *
+ * On x86, there is no other way but to use pfm_start/pfm_stop
+ * to activate monitoring, thus we can simply check flags.started
+ */
+static inline int pfm_arch_is_active(struct pfm_context *ctx)
+{
+ return ctx->flags.started;
+}
+
+
+/**
+ * pfm_arch_unload_context - detach context from thread or CPU
+ * @ctx: context to detach
+ *
+ * in system-wide ctx->task is NULL, otherwise it points to the
+ * attached thread
+ */
+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ if (ctx_arch->flags.insecure) {
+ PFM_DBG("clear cr4.pce");
+ clear_in_cr4(X86_CR4_PCE);
+ }
+
+ if (pmu_info->unload_context)
+ pmu_info->unload_context(ctx);
+}
+
+/**
+ * pfm_arch_load_context - attach context to thread or CPU
+ * @ctx: context to attach
+ */
+static inline int pfm_arch_load_context(struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_arch_context *ctx_arch;
+ int ret = 0;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * RDPMC authorized in system-wide and
+ * per-thread self-monitoring.
+ *
+ * RDPMC only gives access to counts.
+ *
+ * The context-switch routine code does not restore
+ * all the PMD registers (optimization), thus there
+ * is a possible leak of counts there in per-thread
+ * mode.
+ */
+ if (ctx->task == current) {
+ PFM_DBG("set cr4.pce");
+ set_in_cr4(X86_CR4_PCE);
+ ctx_arch->flags.insecure = 1;
+ }
+
+ if (pmu_info->load_context)
+ ret = pmu_info->load_context(ctx);
+
+ return ret;
+}
+
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
+
+/**
+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
+ * @ctx: current context
+ * @set: current event set
+ *
+ * called from __pfm_interrupt_handler().
+ * ctx is not NULL. ctx is locked. interrupts are masked
+ *
+ * The following actions must take place:
+ * - stop all monitoring to ensure handler has consistent view.
+ * - collect overflowed PMDs bitmask into povfls_pmds and
+ * npend_ovfls. If no interrupt detected then npend_ovfls
+ * must be set to zero.
+ */
+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_context *ctx_arch;
+ ctx_arch = pfm_ctx_arch(ctx);
+ /*
+ * on X86, freezing is equivalent to stopping
+ */
+ pfm_arch_stop(current, ctx);
+
+ /*
+ * we mark monitoring as stopped to avoid
+ * certain side effects especially in
+ * pfm_arch_restore_pmcs()
+ */
+ ctx_arch->saved_started = ctx->flags.started;
+ ctx->flags.started = 0;
+}
+
+/**
+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
+ * @ctx: current context
+ *
+ * current context may be NULL when dealing with spurious interrupts
+ *
+ * Must re-activate monitoring if context is not MASKED.
+ * interrupts are masked.
+ */
+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ if (ctx == NULL)
+ return;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ PFM_DBG_ovfl("state=%d", ctx->state);
+
+ /*
+ * restore flags.started which is cleared in
+ * pfm_arch_intr_freeze_pmu()
+ */
+ ctx->flags.started = ctx_arch->saved_started;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
+ * @ctx: current context
+ * @cnum: PMD index
+ *
+ * On some CPUs, the upper bits of a counter must be set in order for the
+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
+ * and the upper bits are cleared. This function may be used to set them back.
+ *
+ * For x86, a simple reset would lose whatever is remaining in the counter,
+ * which is usually a small count. In order not to lose this count,
+ * we do a read-modify-write to set the upper bits while preserving the
+ * low-order bits. This is slow but works.
+ */
+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ u64 val;
+ val = pfm_arch_read_pmd(ctx, cnum);
+ pfm_arch_write_pmd(ctx, cnum, val);
+}
+
+/**
+ * pfm_arch_context_create - create context
+ * @ctx: newly created context
+ * @flags: context flags as passed by user
+ *
+ * called from __pfm_create_context()
+ */
+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
+{
+ return 0;
+}
+
+/**
+ * pfm_arch_context_free - free context
+ * @ctx: context to free
+ */
+static inline void pfm_arch_context_free(struct pfm_context *ctx)
+{}
+
+/*
+ * functions implemented in arch/x86/perfmon/perfmon.c
+ */
+int pfm_arch_init(void);
+void pfm_arch_resend_irq(struct pfm_context *ctx);
+
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
+
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
+void pfm_arch_pmu_config_remove(void);
+char *pfm_arch_get_pmu_module_name(void);
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
+void pfm_arch_pmu_release(void);
+
+static inline void pfm_arch_serialize(void)
+{}
+
+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
+{}
+
+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
+{}
+
+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
+/*
+ * x86 does not need extra alignment requirements for the sampling buffer
+ */
+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
+
+asmlinkage void pmu_interrupt(void);
+
+static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits)
+{
+ bitmap_copy((unsigned long *)a,
+ (unsigned long *)b,
+ nbits);
+}
+
+static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_or((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits)
+{
+ bitmap_and((unsigned long *)a,
+ (unsigned long *)b,
+ (unsigned long *)c,
+ nbits);
+}
+
+
+static inline void pfm_arch_bv_zero(u64 *a, int nbits)
+{
+ bitmap_zero((unsigned long *)a, nbits);
+}
+
+static inline int pfm_arch_bv_weight(u64 *a, int nbits)
+{
+ return bitmap_weight((unsigned long *)a, nbits);
+}
+
+static inline void pfm_arch_bv_set_bit(int b, u64 *a)
+{
+ __set_bit(b, (unsigned long *)a);
+}
+
+static inline void pfm_arch_bv_clear_bit(int b, u64 *a)
+{
+ __clear_bit(b, (unsigned long *)a);
+}
+
+static inline int pfm_arch_bv_test_bit(int b, u64 *a)
+{
+ return test_bit(b, (unsigned long *)a);
+}
+
+static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr,
+ unsigned long size,
+ unsigned long offset)
+{
+ return find_next_bit((unsigned long *)addr,
+ size,
+ offset);
+}
+#endif /* CONFIG_PERFMON */
+
+#endif /* _ASM_X86_PERFMON_KERN_H_ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0ddd534bef44 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
@@ -92,6 +93,7 @@ struct thread_info {
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
+#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -114,6 +116,8 @@ struct thread_info {
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -135,12 +139,12 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERFMON_WORK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
- _TIF_NOTSC)
+ _TIF_NOTSC|_TIF_PERFMON_CTXSW)
#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..06908451002f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,11 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_pfm_create 333
+#define __NR_pfm_write (__NR_pfm_create+1)
+#define __NR_pfm_read (__NR_pfm_create+2)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89fb..a42bb5eb9edb 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_pfm_create 295
+__SYSCALL(__NR_pfm_create, sys_pfm_create)
+#define __NR_pfm_write (__NR_pfm_create+1)
+__SYSCALL(__NR_pfm_write, sys_pfm_write)
+#define __NR_pfm_read (__NR_pfm_create+2)
+__SYSCALL(__NR_pfm_read, sys_pfm_read)
+#define __NR_pfm_attach (__NR_pfm_create+3)
+__SYSCALL(__NR_pfm_attach, sys_pfm_attach)
+#define __NR_pfm_set_state (__NR_pfm_create+4)
+__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9134de814c97..9f8826f33032 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -513,7 +513,7 @@ ENDPROC(system_call)
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
- testb $_TIF_NEED_RESCHED, %cl
+ testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx
jz work_notifysig
work_resched:
call schedule
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 983d85aeccce..1d9bef0797d9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -876,7 +876,13 @@ END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
-
+
+#ifdef CONFIG_PERFMON
+ENTRY(pmu_interrupt)
+ apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt
+END(pmu_interrupt)
+#endif
+
/*
* Exception entry points.
*/
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..24a0140e6c36 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/perfmon_kern.h>
#include <asm/acpi.h>
#include <asm/atomic.h>
@@ -224,6 +225,10 @@ void __init native_init_IRQ(void)
apic_intr_init();
+#ifdef CONFIG_PERFMON
+ alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
+
if (!acpi_ioapic)
setup_irq(2, &irq2);
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..7ff71d4d6d9b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,6 +36,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
@@ -258,6 +259,7 @@ void exit_thread(void)
ds_free(current->thread.ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(gs, p->thread.gs);
+ pfm_copy_thread(p);
+
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
set_debugreg(next->debugreg1, 1);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3180e79c3697..86099f98104a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
+#include <linux/perfmon_kern.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
@@ -255,6 +256,7 @@ void exit_thread(void)
ds_free(t->ds_ctx);
}
#endif /* CONFIG_X86_DS */
+ pfm_exit_thread();
}
void flush_thread(void)
@@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ pfm_copy_thread(p);
+
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
@@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
prev = &prev_p->thread,
next = &next_p->thread;
+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_out(prev_p, next_p);
+
debugctl = prev->debugctlmsr;
#ifdef CONFIG_X86_DS
@@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
+ pfm_ctxsw_in(prev_p, next_p);
+
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
loaddebug(next, 1);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 27a5c8174322..7d6fc603dea7 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -19,6 +19,7 @@
#include <linux/wait.h>
#include <linux/tracehook.h>
#include <linux/elf.h>
+#include <linux/perfmon_kern.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index d2307e41fbdb..24e389836fc0 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -21,6 +21,7 @@
#include <linux/personality.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
+#include <linux/perfmon_kern.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
mce_notify_user();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ /* process perfmon asynchronous work (e.g. block thread or reset) */
+ if (thread_info_flags & _TIF_PERFMON_WORK)
+ pfm_handle_work(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..81c22739f70b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,8 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_pfm_create
+ .long sys_pfm_write
+ .long sys_pfm_read /* 335 */
+ .long sys_pfm_attach
+ .long sys_pfm_set_state
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 022cd41ea9b4..584a9ef4e44c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -17,6 +17,7 @@
#include <linux/moduleparam.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
+#include <linux/perfmon_kern.h>
#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -142,12 +143,18 @@ static int nmi_setup(void)
int err = 0;
int cpu;
- if (!allocate_msrs())
+ if (pfm_session_allcpus_acquire())
+ return -EBUSY;
+
+ if (!allocate_msrs()) {
+ pfm_session_allcpus_release();
return -ENOMEM;
+ }
err = register_die_notifier(&profile_exceptions_nb);
if (err) {
free_msrs();
+ pfm_session_allcpus_release();
return err;
}
@@ -228,6 +235,7 @@ static void nmi_shutdown(void)
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
free_msrs();
+ pfm_session_allcpus_release();
put_cpu_var(cpu_msrs);
}
diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig
new file mode 100644
index 000000000000..8144d1d0d600
--- /dev/null
+++ b/arch/x86/perfmon/Kconfig
@@ -0,0 +1,33 @@
+menu "Hardware Performance Monitoring support"
+config PERFMON
+ bool "Perfmon2 performance monitoring interface"
+ select X86_LOCAL_APIC
+ default n
+ help
+ Enables the perfmon2 interface to access the hardware
+ performance counters. See <http://perfmon2.sf.net/> for
+ more details.
+
+config PERFMON_DEBUG
+ bool "Perfmon debugging"
+ default n
+ depends on PERFMON
+ help
+ Enables perfmon debugging support
+
+config X86_PERFMON_INTEL_ARCH
+ bool "Support for Intel architectural perfmon v1/v2/v3"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Intel architectural performance counters.
+ This feature was introduced with Intel Core Solo/Core Duo processors.
+
+config X86_PERFMON_AMD64
+ bool "Support AMD Athlon/Opteron hardware performance counters"
+ depends on PERFMON
+ default n
+ help
+ Enables support for Athlon/Opteron hardware performance counters.
+ Supports family 6, 15 and 16 processors.
+ endmenu
diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile
new file mode 100644
index 000000000000..c0a4ca0da329
--- /dev/null
+++ b/arch/x86/perfmon/Makefile
@@ -0,0 +1,7 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
+obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o
diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c
new file mode 100644
index 000000000000..844f19dc6cb0
--- /dev/null
+++ b/arch/x86/perfmon/perfmon.c
@@ -0,0 +1,619 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/interrupt.h>
+#include <linux/perfmon_kern.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU(unsigned long, real_iip);
+DEFINE_PER_CPU(int, pfm_using_nmi);
+
+/**
+ * pfm_arch_ctxswin_thread - thread context switch in
+ * @task: task switched in
+ * @ctx: context for the task
+ * @set: active event set
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * set cannot be NULL. Context is locked. Interrupts are masked.
+ *
+ * Caller has already restored all PMD and PMC registers, if
+ * necessary (i.e., lazy restore scheme).
+ *
+ * On x86, the common code only needs to unsecure RDPMC if necessary.
+ *
+ * Model-specific features, e.g., PEBS and IBS, are taken care of in the
+ * corresponding PMU description module.
+ */
+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+
+ /*
+ * restore saved real iip
+ */
+ if (ctx->active_set->npend_ovfls)
+ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
+
+ /*
+ * enable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ set_in_cr4(X86_CR4_PCE);
+}
+
+/**
+ * pfm_arch_ctxswout_thread - context switch out thread
+ * @task: task switched out
+ * @ctx : context switched out
+ *
+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
+ * Context is locked. Interrupts are masked. Monitoring may be active.
+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
+ *
+ * Return:
+ * non-zero : did not save PMDs (as part of stopping the PMU)
+ * 0 : saved PMDs (no need to save them in caller)
+ */
+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_context *ctx_arch;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ ctx_arch = pfm_ctx_arch(ctx);
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * disable lazy restore of PMCS on ctxswin because
+ * we modify some of them.
+ */
+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+
+ if (ctx->active_set->npend_ovfls)
+ ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
+
+ /*
+ * disable RDPMC on this CPU
+ */
+ if (ctx_arch->flags.insecure)
+ clear_in_cr4(X86_CR4_PCE);
+
+ return pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_stop - deactivate monitoring
+ * @task: task to stop
+ * @ctx: context to stop
+ *
+ * Called from pfm_stop()
+ * Interrupts are masked. Context is locked. Set is the active set.
+ *
+ * For per-thread:
+ * task is not necessarily current. If not current task, then
+ * task is guaranteed stopped and off any cpu. Access to PMU
+ * is not guaranteed.
+ *
+ * must disable active monitoring. ctx cannot be NULL
+ */
+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_info();
+
+ /*
+ * no need to go through stop_save()
+ * if we are already stopped
+ */
+ if (!ctx->flags.started)
+ return;
+
+ if (task != current)
+ return;
+
+ pmu_info->stop_save(ctx, ctx->active_set);
+}
+
+
+/**
+ * pfm_arch_start - activate monitoring
+ * @task: task to start
+ * @ctx: context to start
+ *
+ * Interrupts are masked. Context is locked.
+ *
+ * For per-thread:
+ * Task is not necessarily current. If not current task, then task
+ * is guaranteed stopped and off any cpu. No access to PMU if task
+ * is not current.
+ */
+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
+{
+ /*
+ * cannot restore PMC if no access to PMU. Will be done
+ * when the thread is switched back in
+ */
+ if (task != current)
+ return;
+
+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
+}
+
+/**
+ * pfm_arch_restore_pmds - reload PMD registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw()
+ *
+ * Context is locked. Interrupts are masked. Set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ */
+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ num = set->nused_pmds;
+
+ /*
+ * we can restore only the PMD we use because:
+ *
+ * - can only read with pfm_read_pmds() the registers
+ * declared used via pfm_write_pmds()
+ *
+ * - if cr4.pce=1, only counters are exposed to user. RDPMC
+ * does not work with other types of PMU registers. Thus, no
+ * address is ever exposed by counters
+ *
+ * - there is never a dependency between one pmd register and
+ * another
+ */
+ for (i = 0; num; i++) {
+ if (likely(pfm_arch_bv_test_bit(i, set->used_pmds))) {
+ pfm_write_pmd(ctx, i, set->pmds[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * pfm_arch_restore_pmcs - reload PMC registers
+ * @ctx: context to restore from
+ * @set: current event set
+ *
+ * function called from pfm_context_load(), pfm_ctxsw().
+ *
+ * Context is locked. Interrupts are masked. set cannot be NULL.
+ * Access to the PMU is guaranteed.
+ *
+ * function must restore all PMC registers from set
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+ u16 i, num;
+
+ /*
+ * we need to restore PMCs only when:
+ * - context is not masked
+ * - monitoring activated
+ *
+ * Masking monitoring after an overflow does not change the
+ * value of flags.started
+ */
+ if (!ctx->flags.started)
+ return;
+
+ /*
+ * restore all pmcs
+ *
+ * It is not possible to restore only the pmcs we used because
+ * certain PMU models (e.g. Pentium 4) have dependencies. Thus
+ * we do not want one application using stale PMCs coming from
+ * another one.
+ *
+ * On PMU models where there are no dependencies between PMCs, it is
+ * possible to optimize by restoring only the registers that are used,
+ * but this has to be done by model-specific code.
+ */
+ num = ctx->regs.num_pmcs;
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, ctx->regs.pmcs)) {
+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+ num--;
+ }
+ }
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible race with other, higher priority interrupts, such as timer
+ * or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+ unsigned long iip;
+ int using_nmi;
+
+ using_nmi = __get_cpu_var(pfm_using_nmi);
+
+ ack_APIC_irq();
+
+ irq_enter();
+
+ /*
+ * when using NMI, pfm_handle_nmi() gets called
+ * first. It stops monitoring and records the
+ * iip into real_iip, then it reposts the interrupt
+ * using the lower priority vector LOCAL_PERFMON_VECTOR.
+ *
+ * On some processors, e.g., P4, it may be that some
+ * state is already recorded from pfm_handle_nmi()
+ * and it only needs to be copied back into the normal
+ * fields so it can be used transparently by higher level
+ * code.
+ */
+ if (using_nmi)
+ iip = __get_cpu_var(real_iip);
+ else
+ iip = instruction_pointer(regs);
+
+ pfm_interrupt_handler(iip, regs);
+
+ /*
+ * On Intel processors:
+ * - it is necessary to clear the MASK field for the LVTPC
+ * vector. Otherwise interrupts remain masked. See
+ * section 8.5.1
+ * AMD X86-64:
+ * - the documentation does not stipulate the behavior but
+ * it seems to work without the write, so we skip
+ */
+ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+ irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * called from notify_die() notifier from a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, include the perfmon context lock
+ *
+ * Must detect if NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
+ */
+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
+ unsigned long val,
+ void *data)
+{
+ struct die_args *args = data;
+ struct pfm_context *ctx;
+ struct pfm_arch_pmu_info *pmu_info;
+
+ /*
+ * only NMI related calls
+ */
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_DONE;
+
+ /*
+ * perfmon not using NMI
+ */
+ if (!__get_cpu_var(pfm_using_nmi))
+ return NOTIFY_DONE;
+
+ /*
+ * No context
+ */
+ ctx = __get_cpu_var(pmu_ctx);
+ if (!ctx) {
+ PFM_DBG_ovfl("no ctx");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * Detect if we have overflows, i.e., NMI interrupt
+ * caused by PMU
+ */
+ pmu_info = pfm_pmu_info();
+ if (!pmu_info->has_ovfls(ctx)) {
+ PFM_DBG_ovfl("no ovfl");
+ return NOTIFY_DONE;
+ }
+
+ /*
+ * we stop the PMU to avoid further overflow before this
+ * one is treated by the lower priority interrupt handler
+ */
+ pmu_info->quiesce();
+
+ /*
+ * record actual instruction pointer
+ */
+ __get_cpu_var(real_iip) = instruction_pointer(args->regs);
+
+ /*
+ * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
+ */
+ pfm_arch_resend_irq(ctx);
+
+ /*
+ * we need to rewrite the APIC vector on Intel
+ */
+ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * the notification was for us
+ */
+ return NOTIFY_STOP;
+}
+
+static struct notifier_block pfm_nmi_nb = {
+ .notifier_call = pfm_handle_nmi
+};
+
+/**
+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
+ *
+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
+ */
+void pfm_arch_resend_irq(struct pfm_context *ctx)
+{
+ unsigned long val, dest;
+ /*
+ * we cannot use hw_resend_irq() because it goes to
+ * the I/O APIC. We need to go to the Local APIC.
+ *
+ * The "int vec" is not the right solution either
+ * because it triggers a software intr. We need
+ * to regenerate the interrupt and have it pended
+ * until we unmask interrupts.
+ *
+ * Instead we send ourself an IPI on the perfmon
+ * vector.
+ */
+ val = APIC_DEST_SELF|APIC_INT_ASSERT|
+ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
+
+ dest = apic_read(APIC_ID);
+ apic_write(APIC_ICR2, dest);
+ apic_write(APIC_ICR, val);
+}
+
+/**
+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
+ * @data: contains pmu flags
+ */
+static void pfm_arch_pmu_acquire_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ unsigned int tmp, vec;
+ unsigned long flags = (unsigned long)data;
+ unsigned long lvtpc;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ /*
+ * we only reprogram the LVTPC vector if we have detected
+ * no sharing, otherwise it means the APIC is already programmed
+ * and we use whatever vector (likely NMI) is there
+ */
+ if (!(flags & PFM_X86_FL_SHARING)) {
+ vec = LOCAL_PERFMON_VECTOR;
+
+ tmp = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, vec);
+ apic_write(APIC_LVTERR, tmp);
+ }
+ lvtpc = (unsigned long)apic_read(APIC_LVTPC);
+
+ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
+
+ PFM_DBG("LTVPC=0x%lx using_nmi=%d",
+ lvtpc, __get_cpu_var(pfm_using_nmi));
+ /*
+ * invoke model specific acquire routine.
+ */
+ if (pmu_info->acquire_pmu_percpu)
+ pmu_info->acquire_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_acquire - acquire PMU resource from system
+ * @unavail_pmcs : bitmask to use to set unavailable pmcs
+ * @unavail_pmds : bitmask to use to set unavailable pmds
+ *
+ * interrupts are not masked
+ *
+ * Grab PMU registers from lower level MSR allocator
+ *
+ * Program the APIC according to the possible interrupt vector,
+ * either LOCAL_PERFMON_VECTOR or NMI
+ */
+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ struct pfm_regmap_desc *d;
+ u16 i, nlost;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+ pmu_info->flags &= ~PFM_X86_FL_SHARING;
+
+ nlost = 0;
+
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ /*
+ * reserve register with lower-level allocator
+ */
+ if (!reserve_evntsel_nmi(d->hw_addr)) {
+ PFM_DBG("pmc%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmcs);
+ nlost++;
+ continue;
+ }
+ }
+ PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags);
+ /*
+ * some PMU models (e.g., P6) do not support sharing, so check
+ * if we found fewer than the expected number of PMC registers
+ */
+ if (nlost) {
+ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
+ PFM_INFO("PMU already used by another subsystem, "
+ "PMU does not support sharing, "
+ "try disabling Oprofile or "
+ "reboot with nmi_watchdog=0");
+ goto undo;
+ }
+ pmu_info->flags |= PFM_X86_FL_SHARING;
+ }
+
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (!reserve_perfctr_nmi(d->hw_addr)) {
+ PFM_DBG("pmd%d(%s) already used", i, d->desc);
+ pfm_arch_bv_set_bit(i, unavail_pmds);
+ }
+ }
+ /*
+ * program APIC on each CPU
+ */
+ on_each_cpu(pfm_arch_pmu_acquire_percpu,
+ (void *)(unsigned long)pmu_info->flags , 1);
+
+ return 0;
+undo:
+ /*
+ * must undo reservation of pmcs in case of error
+ */
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+ if (!pfm_arch_bv_test_bit(i, unavail_pmcs))
+ release_evntsel_nmi(d->hw_addr);
+ }
+ return -EBUSY;
+}
+
+/**
+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
+ *
+ */
+static void pfm_arch_pmu_release_percpu(void *data)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+
+ pmu_info = pfm_pmu_conf->pmu_info;
+
+ __get_cpu_var(pfm_using_nmi) = 0;
+ /*
+ * invoke model specific release routine.
+ */
+ if (pmu_info->release_pmu_percpu)
+ pmu_info->release_pmu_percpu();
+}
+
+/**
+ * pfm_arch_pmu_release - release PMU resource to system
+ *
+ * called from pfm_pmu_release()
+ * interrupts are not masked
+ *
+ * On x86, we return the PMU registers to the MSR allocator
+ */
+void pfm_arch_pmu_release(void)
+{
+ struct pfm_regmap_desc *d;
+ u16 i, n;
+
+ d = pfm_pmu_conf->pmc_desc;
+ n = pfm_pmu_conf->regs_all.num_pmcs;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs))
+ continue;
+ release_evntsel_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmc%u released", i);
+ }
+ d = pfm_pmu_conf->pmd_desc;
+ n = pfm_pmu_conf->regs_all.num_pmds;
+ for (i = 0; n; i++, d++) {
+ if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds))
+ continue;
+ release_perfctr_nmi(d->hw_addr);
+ n--;
+ PFM_DBG("pmd%u released", i);
+ }
+
+ /* clear NMI variable if used */
+ if (__get_cpu_var(pfm_using_nmi))
+ on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1);
+}
+
+/**
+ * pfm_arch_init - one time global arch-specific initialization
+ *
+ * called from pfm_init()
+ */
+int __init pfm_arch_init(void)
+{
+ /*
+ * we need to register our NMI handler when the kernels boots
+ * to avoid a deadlock condition with the NMI watchdog or Oprofile
+ * if we were to try and register/unregister on-demand.
+ */
+ register_die_notifier(&pfm_nmi_nb);
+ return 0;
+}
diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c
new file mode 100644
index 000000000000..f078fe28137d
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_amd64.c
@@ -0,0 +1,483 @@
+/*
+ * This file contains the PMU description for the Athlon64 and Opteron64
+ * processors. It supports 32 and 64-bit modes.
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ * Contributed by Robert Richter <robert.richter@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
+#include <linux/topology.h>
+#include <linux/pci.h>
+#include <linux/perfmon_kern.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+
+static void __kprobes pfm_amd64_quiesce(void);
+static int pfm_amd64_has_ovfls(struct pfm_context *ctx);
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+
+static struct pfm_arch_pmu_info pfm_amd64_pmu_info = {
+ .stop_save = pfm_amd64_stop_save,
+ .has_ovfls = pfm_amd64_has_ovfls,
+ .quiesce = pfm_amd64_quiesce,
+};
+
+/*
+ * force Local APIC interrupt on overflow
+ */
+#define PFM_K8_VAL (1ULL<<20)
+#define PFM_K8_NO64 (1ULL<<20)
+
+/*
+ * reserved bits must be 1
+ *
+ * for family 15:
+ * - upper 32 bits are reserved
+ * - bit 20, bit 21
+ *
+ * for family 16:
+ * - bits 36-39 are reserved
+ * - bits 42-63 are reserved
+ * - bit 20, bit 21
+ *
+ */
+#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
+#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21))
+
+static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = {
+/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0),
+/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1),
+/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2),
+/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3),
+};
+#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc)
+
+/*
+ * AMD64 counters are 48 bits, upper bits are reserved
+ */
+#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1))
+
+#define PFM_AMD_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PERFCTR"#n, \
+ .hw_addr = MSR_K7_PERFCTR0+n, \
+ .rsvd_msk = PFM_AMD64_CTR_RSVD, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = {
+/* pmd0 */ PFM_AMD_D(0),
+/* pmd1 */ PFM_AMD_D(1),
+/* pmd2 */ PFM_AMD_D(2),
+/* pmd3 */ PFM_AMD_D(3)
+};
+#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc)
+
+static struct pfm_context *pfm_nb_task_owner;
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf;
+
+/**
+ * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events
+ * @ctx: context to use
+ *
+ * There can only be one user per socket for the Northbridge (NB) events,
+ * so we enforce mutual exclusion as follows:
+ * - per-thread : only one context machine-wide can use NB events
+ *
+ * Exclusion is enforced at:
+ * - pfm_load_context()
+ * - pfm_write_pmcs() for attached contexts
+ *
+ * Exclusion is released at:
+ * - pfm_unload_context() or any call that implicitly uses it
+ *
+ * return:
+ * 0 : successfully acquire NB access
+ * < 0: errno, failed to acquire NB access
+ */
+static int pfm_amd64_acquire_nb(struct pfm_context *ctx)
+{
+ struct pfm_context **entry, *old;
+ int proc_id;
+
+#ifdef CONFIG_SMP
+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
+#else
+ proc_id = 0;
+#endif
+
+ entry = &pfm_nb_task_owner;
+
+ old = cmpxchg(entry, NULL, ctx);
+ if (!old) {
+ PFM_DBG("acquired Northbridge event access globally");
+ } else if (old != ctx) {
+ PFM_DBG("global NorthBridge event conflict");
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * pfm_amd64_pmc_write_check -- check validity of pmc writes
+ * @ctx: context to use
+ * @set: event set to use
+ * @req: user request to modify the pmc
+ *
+ * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL, i.e.,
+ * when we have detected a multi-core processor.
+ *
+ * context is locked, interrupts are masked
+ */
+static int pfm_amd64_pmc_write_check(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req)
+{
+ unsigned int event;
+
+ /*
+ * delay checking NB event until we load the context
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return 0;
+
+ /*
+ * check event is NB event
+ */
+ event = (unsigned int)(req->reg_value & 0xff);
+ if (event < 0xee)
+ return 0;
+
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_load_context - amd64 model-specific load callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_load_context().
+ * context is locked, interrupts are masked
+ */
+static int pfm_amd64_load_context(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ unsigned int i, n;
+
+ set = ctx->active_set;
+ n = set->nused_pmcs;
+ for (i = 0; n; i++) {
+ if (!pfm_arch_bv_test_bit(i, set->used_pmcs))
+ continue;
+
+ if ((set->pmcs[i] & 0xff) >= 0xee)
+ goto found;
+ n--;
+ }
+ return 0;
+found:
+ return pfm_amd64_acquire_nb(ctx);
+}
+
+/**
+ * pfm_amd64_unload_context -- amd64 model-specific unload callback
+ * @ctx: context to use
+ *
+ * invoked on pfm_unload_context()
+ */
+static void pfm_amd64_unload_context(struct pfm_context *ctx)
+{
+ struct pfm_context **entry, *old;
+ int proc_id;
+
+#ifdef CONFIG_SMP
+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
+#else
+ proc_id = 0;
+#endif
+
+ entry = &pfm_nb_task_owner;
+
+ old = cmpxchg(entry, ctx, NULL);
+ if (old == ctx)
+ PFM_DBG("released NorthBridge events globally");
+}
+
+/**
+ * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls
+ *
+ * detect if we need to activate NorthBridge event access control
+ */
+static int pfm_amd64_setup_nb_event_ctrl(void)
+{
+ unsigned int c, n = 0;
+ unsigned int max_phys = 0;
+
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(c) {
+ if (cpu_data(c).phys_proc_id > max_phys)
+ max_phys = cpu_data(c).phys_proc_id;
+ }
+#else
+ max_phys = 0;
+#endif
+ if (max_phys > 255) {
+ PFM_INFO("socket id %d is too big to handle", max_phys);
+ return -ENOMEM;
+ }
+
+ n = max_phys + 1;
+ if (n < 2)
+ return 0;
+
+ pfm_nb_task_owner = NULL;
+
+ /*
+ * activate write-checker for PMC registers
+ */
+ for (c = 0; c < PFM_AMD_NUM_PMCS; c++)
+ pfm_amd64_pmc_desc[c].type |= PFM_REG_WC;
+
+ pfm_amd64_pmu_info.load_context = pfm_amd64_load_context;
+ pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context;
+
+ pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check;
+
+ PFM_INFO("NorthBridge event access control enabled");
+
+ return 0;
+}
+
+/**
+ * pfm_amd64_setup_registers -- initialize register table
+ *
+ * modify register table based on actual host CPU
+ */
+static void pfm_amd64_setup_registers(void)
+{
+ u16 i;
+
+ pfm_arch_bv_set_bit(0, enable_mask);
+ pfm_arch_bv_set_bit(1, enable_mask);
+ pfm_arch_bv_set_bit(2, enable_mask);
+ pfm_arch_bv_set_bit(3, enable_mask);
+ max_enable = 3+1;
+
+ /*
+ * adjust reserved bit fields for family 16
+ */
+ if (current_cpu_data.x86 == 16) {
+ for (i = 0; i < PFM_AMD_NUM_PMCS; i++)
+ if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD)
+ pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD;
+ }
+}
+
+/**
+ * pfm_amd64_probe_pmu -- detect host PMU
+ */
+static int pfm_amd64_probe_pmu(void)
+{
+ if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return -1;
+
+ switch (current_cpu_data.x86) {
+ case 6:
+ case 15:
+ case 16:
+ PFM_INFO("found family=%d", current_cpu_data.x86);
+ break;
+ default:
+ PFM_INFO("unsupported family=%d", current_cpu_data.x86);
+ return -1;
+ }
+
+ /*
+ * check for local APIC (required)
+ */
+ if (!cpu_has_apic) {
+ PFM_INFO("no local APIC, unsupported");
+ return -1;
+ }
+
+ if (current_cpu_data.x86_max_cores > 1
+ && pfm_amd64_setup_nb_event_ctrl())
+ return -1;
+
+ pfm_amd64_setup_registers();
+
+ return 0;
+}
+
+/**
+ * pfm_amd64_has_ovfls -- detect if pending overflows
+ * @ctx: context to use
+ *
+ * detect if counters have overflowed.
+ * return:
+ * 0 : no overflow
+ * 1 : at least one overflow
+ */
+static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx)
+{
+ struct pfm_regmap_desc *xrd;
+ u64 *cnt_mask;
+ u64 wmask, val;
+ u16 i, num;
+
+ /*
+ * Check regular counters
+ */
+ cnt_mask = ctx->regs.cnt_pmds;
+ num = ctx->regs.num_counters;
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+ xrd = pfm_amd64_pmd_desc;
+
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+ rdmsrl(xrd[i].hw_addr, val);
+ if (!(val & wmask))
+ return 1;
+ num--;
+ }
+ }
+ return 0;
+}
+
+/**
+ * pfm_amd64_stop_save - stop monitoring, collect pending overflows
+ * @ctx: context to use
+ * @set: event set to stop
+ *
+ * interrupts are masked, PMU access guaranteed
+ */
+static int pfm_amd64_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ struct pfm_arch_pmu_info *pmu_info;
+ u64 used_mask[PFM_PMC_BV];
+ u64 *cnt_pmds;
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ pmu_info = pfm_pmu_info();
+
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already have a pending overflow condition, we simply
+ * return to take care of it first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_pmds = ctx->regs.cnt_pmds;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) {
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ }
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_amd64_quiesce -- stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_amd64_quiesce(void)
+{
+ /*
+ * quiesce PMU by clearing available registers that have
+ * the start/stop capability
+ */
+ if (pfm_arch_bv_test_bit(0, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0, 0);
+ if (pfm_arch_bv_test_bit(1, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+1, 0);
+ if (pfm_arch_bv_test_bit(2, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+2, 0);
+ if (pfm_arch_bv_test_bit(3, pfm_pmu_conf->regs_all.pmcs))
+ wrmsrl(MSR_K7_EVNTSEL0+3, 0);
+}
+
+static struct pfm_pmu_config pfm_amd64_pmu_conf = {
+ .pmu_name = "AMD64",
+ .counter_width = 47,
+ .pmd_desc = pfm_amd64_pmd_desc,
+ .pmc_desc = pfm_amd64_pmc_desc,
+ .num_pmc_entries = PFM_AMD_NUM_PMCS,
+ .num_pmd_entries = PFM_AMD_NUM_PMDS,
+ .version = "1.2",
+ .pmu_info = &pfm_amd64_pmu_info
+};
+
+static int __init pfm_amd64_pmu_init_module(void)
+{
+ if (pfm_amd64_probe_pmu())
+ return -ENOSYS;
+ return pfm_pmu_register(&pfm_amd64_pmu_conf);
+}
+
+device_initcall(pfm_amd64_pmu_init_module);
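pfm_amd64_has_ovfls() and pfm_amd64_stop_save() above both rely on the same arithmetic: a counter is armed with its bit 47 set (counter_width in pfm_amd64_pmu_conf), it counts upward, and once the hardware value no longer has that bit set the counter has wrapped; the low 47 bits are then merged back into the 64-bit software value. The stand-alone sketch below only illustrates that arithmetic; the helper names and the 1000-event period are not part of the patch.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the width/overflow arithmetic used by the AMD64 module above. */
#define CTR_BITS	48			/* physical width (see PFM_AMD64_CTR_RSVD) */
#define COUNTER_WIDTH	47			/* pfm_amd64_pmu_conf.counter_width */
#define WMASK		(1ULL << COUNTER_WIDTH)	/* bit tested by has_ovfls() */
#define OVFL_MASK	(WMASK - 1)		/* low bits taken from hardware */
#define HW_MASK		((1ULL << CTR_BITS) - 1)

/* arm a counter for 'period' events: hardware sees the low 48 bits of -period */
static uint64_t arm_counter(uint64_t period)
{
	return (0ULL - period) & HW_MASK;	/* bit 47 set while period <= 2^47 */
}

/* overflow detected once the armed bit has cleared */
static int has_overflowed(uint64_t hw_val)
{
	return !(hw_val & WMASK);
}

/* 64-bit software value = saved upper bits | live low bits (as in stop_save) */
static uint64_t merge64(uint64_t soft_val, uint64_t hw_val)
{
	return (soft_val & ~OVFL_MASK) | (hw_val & OVFL_MASK);
}

int main(void)
{
	uint64_t hw = arm_counter(1000);

	printf("armed value  : 0x%llx\n", (unsigned long long)hw);
	hw = (hw + 1000) & HW_MASK;		/* simulate 1000 events: counter wraps */
	printf("after period : 0x%llx overflowed=%d\n",
	       (unsigned long long)hw, has_overflowed(hw));
	printf("merged 64-bit: 0x%llx\n",
	       (unsigned long long)merge64(0x0000800000000000ULL, hw));
	return 0;
}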
diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c
new file mode 100644
index 000000000000..ce4293dcfcda
--- /dev/null
+++ b/arch/x86/perfmon/perfmon_intel_arch.c
@@ -0,0 +1,628 @@
+/*
+ * This file contains the Intel architectural perfmon v1, v2, v3
+ * description tables.
+ *
+ * Architectural perfmon was introduced with Intel Core Solo/Duo
+ * processors.
+ *
+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kprobes.h>
+#include <linux/perfmon_kern.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+static u64 enable_mask[PFM_MAX_PMCS];
+static u16 max_enable;
+static int pfm_intel_arch_version;
+
+DEFINE_PER_CPU(u64, saved_global_ctrl);
+
+/*
+ * - upper 32 bits are reserved
+ * - INT: APIC enable bit is reserved (forced to 1)
+ * - bit 21 is reserved
+ *
+ * RSVD: reserved bits are 1
+ */
+#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \
+ | (1ULL<<20) \
+ | (1ULL<<21))
+
+/*
+ * force Local APIC interrupt on overflow
+ * disable with NO_EMUL64
+ */
+#define PFM_IA_PMC_VAL (1ULL<<20)
+#define PFM_IA_NO64 (1ULL<<20)
+
+/*
+ * the architecture specifies that:
+ * IA32_PMCx MSR : starts at 0x0c1 & occupies a contiguous block of MSRs
+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupies a contiguous block of MSRs
+ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupies a contiguous block of MSRs
+ */
+#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0
+#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0
+#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0
+
+/*
+ * layout of EAX for CPUID.0xa leaf function
+ */
+struct pmu_eax {
+ unsigned int version:8; /* architectural perfmon version */
+ unsigned int num_cnt:8; /* number of generic counters */
+ unsigned int cnt_width:8; /* width of generic counters */
+ unsigned int ebx_length:8; /* number of architected events */
+};
+
+/*
+ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
+ */
+struct pmu_edx {
+ unsigned int num_cnt:5; /* number of fixed counters */
+ unsigned int cnt_width:8; /* width of fixed counters */
+ unsigned int reserved:19;
+};
+
+static void pfm_intel_arch_acquire_pmu_percpu(void);
+static void pfm_intel_arch_release_pmu_percpu(void);
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set);
+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
+static void __kprobes pfm_intel_arch_quiesce(void);
+
+/*
+ * physical addresses of MSR controlling the perfevtsel and counter registers
+ */
+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
+ .stop_save = pfm_intel_arch_stop_save,
+ .has_ovfls = pfm_intel_arch_has_ovfls,
+ .quiesce = pfm_intel_arch_quiesce,
+ .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,
+ .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu
+};
+
+#define PFM_IA_C(n) { \
+ .type = PFM_REG_I64, \
+ .desc = "PERFEVTSEL"#n, \
+ .dfl_val = PFM_IA_PMC_VAL, \
+ .rsvd_msk = PFM_IA_PMC_RSVD, \
+ .no_emul64_msk = PFM_IA_NO64, \
+ .hw_addr = MSR_GEN_SEL_BASE+(n) \
+ }
+
+#define PFM_IA_D(n) \
+ { .type = PFM_REG_C, \
+ .desc = "PMC"#n, \
+ .hw_addr = MSR_P6_PERFCTR0+n, \
+ .dep_pmcs[0] = 1ULL << n \
+ }
+
+#define PFM_IA_FD(n) \
+ { .type = PFM_REG_C, \
+ .desc = "FIXED_CTR"#n, \
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
+ .dep_pmcs[0] = 1ULL << 16 \
+ }
+
+
+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
+/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3),
+/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7),
+/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11),
+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
+
+/* pmc16 */ { .type = PFM_REG_I,
+ .desc = "FIXED_CTRL",
+ .dfl_val = 0x8888888888888888ULL, /* force PMI */
+ .rsvd_msk = 0, /* set dynamically */
+ .no_emul64_msk = 0,
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
+ },
+};
+#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc)
+
+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
+/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3),
+/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7),
+/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
+
+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
+/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15)
+};
+#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc)
+
+#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */
+#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */
+#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */
+
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
+
+static void pfm_intel_arch_check_errata(void)
+{
+ /*
+ * Core Duo errata AE49 (no fix). Both counters share a single
+ * enable bit in PERFEVTSEL0
+ */
+ if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14)
+ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
+}
+
+static inline void set_enable_mask(unsigned int i)
+{
+ pfm_arch_bv_set_bit(i, enable_mask);
+
+ /* max_enable = highest + 1 */
+ if ((i+1) > max_enable)
+ max_enable = i + 1;
+}
+
+static void pfm_intel_arch_setup_generic(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd;
+ unsigned int i;
+
+ /*
+ * first we handle the generic counters:
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded generic counters as unimplemented
+ */
+
+ /*
+ * min of number of Hw counters and hardcoded in the tables
+ */
+ if (count >= PFM_IA_MAX_CNT) {
+ printk(KERN_INFO "perfmon: Limiting number of generic counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_CNT, count);
+ count = PFM_IA_MAX_CNT;
+ }
+
+ /*
+ * adjust rsvd_msk for generic counters based on actual width
+ * initialize enable_mask (1 per pmd)
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++) {
+ pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
+ set_enable_mask(i);
+ }
+
+ /*
+ * handle the version 3 anythread bit (bit 21)
+ */
+ if (version == 3) {
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
+ }
+
+
+ /*
+ * mark unused generic counters as not available
+ */
+ for (i = count ; i < PFM_IA_MAX_CNT; i++) {
+ pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
+ pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
+ }
+}
+
+static void pfm_intel_arch_setup_fixed(unsigned int version,
+ unsigned int width,
+ unsigned int count)
+{
+ u64 rsvd, dfl;
+ unsigned int i;
+
+ /*
+ * handle the fixed counters (if any):
+ *
+ * - ensure HW does not have more registers than hardcoded in the tables
+ * - adjust rsvd_msk to actual counter width
+ * - initialize enable_mask (list of PMC with start/stop capability)
+ * - mark unused hardcoded generic counters as unimplemented
+ */
+ if (count >= PFM_IA_MAX_FCNT) {
+ printk(KERN_INFO "perfmon: Limiting number of fixed counters"
+ " to %u, HW supports %u",
+ PFM_IA_MAX_FCNT, count);
+ count = PFM_IA_MAX_FCNT;
+ }
+ /*
+ * adjust rsvd_msk for fixed counters based on actual width
+ */
+ rsvd = ~((1ULL << width)-1);
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
+
+ /*
+ * handle the version 3 anythread bit (bit 2)
+ */
+ if (version == 3)
+ rsvd = 1ULL << 3;
+ else
+ rsvd = 3ULL << 2;
+
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = 0;
+ for (i = 0; i < count; i++)
+ pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2);
+
+ /*
+ * mark unused fixed counters as unimplemented
+ *
+ * update the rsvd_msk, dfl_val in FIXED_CTRL:
+ * - rsvd_msk: set all 4 bits
+ * - dfl_val : clear all 4 bits
+ */
+ dfl = pfm_intel_arch_pmc_desc[16].dfl_val;
+ rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk;
+
+ for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
+ rsvd |= 0xfULL << (i<<2);
+ dfl &= ~(0xfULL << (i<<2));
+ }
+
+ /*
+ * FIXED_CTR_CTRL unavailable when no fixed counters are defined
+ */
+ if (!count) {
+ pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA;
+ } else {
+ /* update rsvd_mask and dfl_val */
+ pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd;
+ pfm_intel_arch_pmc_desc[16].dfl_val = dfl;
+ set_enable_mask(16);
+ }
+}
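For reference, FIXED_CTR_CTRL packs one 4-bit control field per fixed counter: bits 0-1 enable OS/user counting, bit 3 is the PMI bit forced by the 0x8888... default, and bit 2 (anythread) only becomes writable with v3. The loop above builds rsvd_msk and dfl_val nibble by nibble; the short stand-alone sketch below redoes the same computation with illustrative names only.

#include <stdio.h>
#include <stdint.h>

#define MAX_FIXED 16	/* PFM_IA_MAX_FCNT */

/* Illustrative re-implementation of the mask computation in setup_fixed(). */
static void fixed_ctrl_masks(unsigned int version, unsigned int count,
			     uint64_t *rsvd_msk, uint64_t *dfl_val)
{
	/* bit 3 (PMI) stays reserved; bit 2 (anythread) opens up with v3 */
	uint64_t per_ctr = (version == 3) ? 1ULL << 3 : 3ULL << 2;
	uint64_t rsvd = 0, dfl = 0x8888888888888888ULL;	/* force PMI */
	unsigned int i;

	for (i = 0; i < count; i++)
		rsvd |= per_ctr << (i << 2);		/* 4 bits per counter */

	for (i = count; i < MAX_FIXED; i++) {		/* unimplemented counters */
		rsvd |= 0xfULL << (i << 2);
		dfl &= ~(0xfULL << (i << 2));
	}
	*rsvd_msk = rsvd;
	*dfl_val = dfl;
}

int main(void)
{
	uint64_t rsvd, dfl;

	fixed_ctrl_masks(3, 3, &rsvd, &dfl);	/* e.g., v3 with 3 fixed counters */
	printf("FIXED_CTRL rsvd_msk=0x%016llx dfl_val=0x%016llx\n",
	       (unsigned long long)rsvd, (unsigned long long)dfl);
	return 0;
}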
+
+static int pfm_intel_arch_probe_pmu(void)
+{
+ union {
+ unsigned int val;
+ struct pmu_eax eax;
+ struct pmu_edx edx;
+ } eax, edx;
+ unsigned int ebx, ecx;
+ unsigned int width = 0;
+
+ edx.val = 0;
+
+ if (!cpu_has_arch_perfmon) {
+ PFM_INFO("no support for Intel architectural PMU");
+ return -1;
+ }
+
+ if (!cpu_has_apic) {
+ PFM_INFO("no Local APIC, try rebooting with lapic option");
+ return -1;
+ }
+
+ /* cpuid() call protected by cpu_has_arch_perfmon */
+ cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
+
+ /*
+ * some 6/15 models have buggy BIOS
+ */
+ if (eax.eax.version == 0
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
+ eax.eax.version = 2;
+ eax.eax.num_cnt = 2;
+ eax.eax.cnt_width = 40;
+ }
+
+ /*
+ * some v2 BIOSes are incomplete
+ */
+ if (eax.eax.version == 2 && !edx.edx.num_cnt) {
+ PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ edx.edx.cnt_width = 40;
+ }
+
+ /*
+ * no fixed counters on earlier versions
+ */
+ if (eax.eax.version < 2) {
+ edx.val = 0;
+ } else {
+ /*
+ * use the min value of both widths until we support
+ * variable width counters
+ */
+ width = eax.eax.cnt_width < edx.edx.cnt_width ?
+ eax.eax.cnt_width : edx.edx.cnt_width;
+ }
+
+ /*
+ * Intel Atom processors have buggy firmware which does not report
+ * the correct number of fixed counters
+ */
+ if (eax.eax.version == 3 && edx.edx.num_cnt < 3
+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
+ PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
+ edx.edx.num_cnt = 3;
+ }
+
+ PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
+ PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
+ eax.eax.num_cnt,
+ eax.eax.cnt_width,
+ edx.edx.num_cnt,
+ edx.edx.cnt_width);
+
+ pfm_intel_arch_setup_generic(eax.eax.version,
+ width,
+ eax.eax.num_cnt);
+
+ pfm_intel_arch_setup_fixed(eax.eax.version,
+ width,
+ edx.edx.num_cnt);
+
+ pfm_intel_arch_check_errata();
+
+ pfm_intel_arch_version = eax.eax.version;
+
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
+ * @ctx: context to work on
+ *
+ * detect if counters have overflowed.
+ * return:
+ * 0 : no overflow
+ * 1 : at least one overflow
+ */
+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
+{
+ u64 *cnt_mask;
+ u64 wmask, val;
+ u16 i, num;
+
+ cnt_mask = ctx->regs.cnt_pmds;
+ num = ctx->regs.num_counters;
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ /*
+ * we can leverage the fact that we know the mapping
+ * to hardcode the MSR address and avoid accessing
+ * more cachelines
+ *
+ * We need to check cnt_mask because not all registers
+ * may be available.
+ */
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, cnt_mask)) {
+ rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val);
+ if (!(val & wmask))
+ return 1;
+ num--;
+ }
+ }
+ return 0;
+}
+
+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u64 used_mask[PFM_PMC_BV];
+ u64 val, wmask, ovfl_mask;
+ u32 i, count;
+
+ wmask = 1ULL << pfm_pmu_conf->counter_width;
+
+ pfm_arch_bv_and(used_mask,
+ set->used_pmcs,
+ enable_mask,
+ max_enable);
+
+ count = pfm_arch_bv_weight(used_mask, max_enable);
+
+ /*
+ * stop monitoring
+ * Unfortunately, this is very expensive!
+ * wrmsrl() is serializing.
+ */
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, used_mask)) {
+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
+ count--;
+ }
+ }
+
+ /*
+ * if we already have a pending overflow condition, we simply
+ * return to take care of it first.
+ */
+ if (set->npend_ovfls)
+ return 1;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+ /*
+ * check for pending overflows and save PMDs (combo)
+ * we employ used_pmds because we also need to save
+ * and not just check for pending interrupts.
+ *
+ * all pmds are counters
+ */
+ count = set->nused_pmds;
+ for (i = 0; count; i++) {
+ if (pfm_arch_bv_test_bit(i, set->used_pmds)) {
+ val = pfm_arch_read_pmd(ctx, i);
+ if (!(val & wmask)) {
+ pfm_arch_bv_set_bit(i, set->povfl_pmds);
+ set->npend_ovfls++;
+ }
+ val = (set->pmds[i] & ~ovfl_mask)
+ | (val & ovfl_mask);
+ set->pmds[i] = val;
+ count--;
+ }
+ }
+ /* 0 means: no need to save PMDs at upper level */
+ return 0;
+}
+
+/**
+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
+ *
+ * called from NMI interrupt handler to immediately stop monitoring
+ * cannot grab any lock, including perfmon related locks
+ */
+static void __kprobes pfm_intel_arch_quiesce(void)
+{
+ u16 i;
+
+ /*
+ * PMC16 is the fixed control register so it has a
+ * distinct MSR address
+ *
+ * We do not use the hw_addr field in the table to avoid touching
+ * too many cachelines
+ */
+ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
+ if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) {
+ if (i == 16)
+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
+ else
+ wrmsrl(MSR_P6_EVNTSEL0+i, 0);
+ }
+ }
+}
+/**
+* pfm_intel_arch_acquire_pmu_percpu - acquire PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we simply make sure that all available counters are enabled.
+* After that, start/stop is controlled on a per-counter basis.
+*/
+static void pfm_intel_arch_acquire_pmu_percpu(void)
+{
+ struct pfm_regmap_desc *d;
+ u64 mask = 0;
+ unsigned int i;
+
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ /*
+ * build bitmask of registers that are available to
+ * us. In some cases, there may be fewer registers than
+ * what the PMU supports due to sharing with other kernel
+ * subsystems, such as NMI
+ */
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < 16; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << i;
+ }
+ for (i = 16; i < PFM_IA_MAX_PMDS; i++) {
+ if ((d[i].type & PFM_REG_I) == 0)
+ continue;
+ mask |= 1ull << (32+i-16);
+ }
+ /*
+ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
+ */
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+
+ PFM_DBG("global=0x%llx set to 0x%llx",
+ __get_cpu_var(saved_global_ctrl),
+ mask);
+ /*
+ * enable all registers
+ *
+ * No need to quiesce PMU. If there is a overflow, it will be
+ * treated as spurious by the handler
+ */
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
+}
+
+/**
+* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU
+*
+* Since v2, there exists global control MSR, to start/stop and
+* also collect overflow status information. In particular,
+* GLOBAL_CTRL controls start/stop and has one bit per counter.
+* To maintain backward compatibility with v1, the power-on value
+* of GLOBAL_CTRL should be such that generic counters are enabled
+* but fixed counters are disabled (true on Penryn and Atom currently).
+*
+* Here, we are done using the PMU, so we restore the power-on value.
+*/
+static void pfm_intel_arch_release_pmu_percpu(void)
+{
+ /* nothing to do for v1 */
+ if (pfm_intel_arch_version < 2)
+ return;
+
+ PFM_DBG("global_ctrl restored to 0x%llx\n",
+ __get_cpu_var(saved_global_ctrl));
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
+}
+
+/*
+ * Counters may have model-specific width. Yet the documentation says
+ * that only the lower 32 bits can be written to due to the specification
+ * of wrmsr. Bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must
+ * not be set (see rsvd_msk for PMDs). As such the effective width of a
+ * counter is 31 bits only regardless of what CPUID.0xa returns.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
+ */
+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
+ .pmu_name = "Intel architectural",
+ .pmd_desc = pfm_intel_arch_pmd_desc,
+ .counter_width = 31,
+ .num_pmc_entries = PFM_IA_MAX_PMCS,
+ .num_pmd_entries = PFM_IA_MAX_PMDS,
+ .pmc_desc = pfm_intel_arch_pmc_desc,
+ .version = "1.0",
+ .pmu_info = &pfm_intel_arch_pmu_info
+};
+
+static int __init pfm_intel_arch_pmu_init_module(void)
+{
+ if (pfm_intel_arch_probe_pmu())
+ return -ENOSYS;
+
+ return pfm_pmu_register(&pfm_intel_arch_pmu_conf);
+}
+
+device_initcall(pfm_intel_arch_pmu_init_module);
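pfm_intel_arch_probe_pmu() reads CPUID leaf 0xa and interprets EAX/EDX through the pmu_eax/pmu_edx layouts above. The same decoding can be checked from user space; the sketch below uses GCC's cpuid.h helper and reads only the fields those structures define, purely as an illustration of the probe logic.

#include <stdio.h>
#include <cpuid.h>

/* Illustrative only: decode CPUID.0xA the way pfm_intel_arch_probe_pmu() does. */
int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0xa not available");
		return 1;
	}

	printf("arch perfmon version : %u\n", eax & 0xff);
	printf("generic counters     : %u x %u bits\n",
	       (eax >> 8) & 0xff, (eax >> 16) & 0xff);
	printf("fixed counters       : %u x %u bits\n",
	       edx & 0x1f, (edx >> 5) & 0xff);
	return 0;
}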
diff --git a/include/linux/perfmon.h b/include/linux/perfmon.h
new file mode 100644
index 000000000000..6117e605a43b
--- /dev/null
+++ b/include/linux/perfmon.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __LINUX_PERFMON_H__
+#define __LINUX_PERFMON_H__
+
+/*
+ * This file contains all the user visible generic definitions for the
+ * interface. Model-specific user-visible definitions are located in
+ * the asm/perfmon.h file.
+ */
+
+/*
+ * include arch-specific user interface definitions
+ */
+#include <asm/perfmon.h>
+
+/*
+ * defined by each arch
+ */
+#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS
+#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS
+
+/*
+ * number of elements for each type of bitvector
+ * all bitvectors use u64 fixed size type on all architectures.
+ */
+#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3))
+#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS)
+#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS)
+
+/*
+ * argument to pfm_create
+ * populated on return
+ */
+struct pfarg_sinfo {
+ __u64 sif_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */
+ __u64 sif_avail_pmds[PFM_PMD_BV];/* out: available PMDs */
+ __u64 sif_reserved1[4]; /* for future use */
+};
+
+/*
+ * PMC and PMD generic register description
+ */
+struct pfarg_pmr {
+ __u16 reg_num; /* which register */
+ __u16 reg_res1; /* reserved */
+ __u32 reg_flags; /* REGFL flags */
+ __u64 reg_value; /* 64-bit value */
+};
+
+/*
+ * pfm_write, pfm_read type:
+ */
+#define PFM_RW_PMD 0x01 /* accessing PMD registers */
+#define PFM_RW_PMC 0x02 /* accessing PMC registers */
+
+/*
+ * pfm_set_state state:
+ */
+#define PFM_ST_START 0x01 /* start monitoring */
+#define PFM_ST_STOP 0x02 /* stop monitoring */
+
+/*
+ * pfm_attach special target to trigger detach
+ */
+#define PFM_NO_TARGET -1 /* detach session target */
+
+/*
+ * default value for the user and group security parameters in
+ * /proc/sys/kernel/perfmon/sys_group
+ * /proc/sys/kernel/perfmon/task_group
+ */
+#define PFM_GROUP_PERM_ANY -1 /* any user/group */
+
+/*
+ * perfmon version number
+ */
+#define PFM_VERSION_MAJ 3U
+#define PFM_VERSION_MIN 0U
+#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\
+ (PFM_VERSION_MIN & 0xffff))
+#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
+#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
+
+#endif /* __LINUX_PERFMON_H__ */
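These definitions, together with the sys_pfm_create/sys_pfm_write prototypes added to syscalls.h further down in this patch, are enough to sketch how a monitoring tool would drive the interface from user space. The snippet below is only an outline: the syscall numbers are placeholders, the event encoding is model-specific, the header is assumed to be exported to user space, and pfm_attach/pfm_set_state are inferred from the PFM_NO_TARGET and PFM_ST_* definitions above rather than shown in this hunk.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/perfmon.h>	/* assumes the header is exported to user space */

/* Placeholder syscall numbers -- the real values are architecture-specific. */
#define __NR_pfm_create	500
#define __NR_pfm_write	501

int main(void)
{
	struct pfarg_sinfo sif;
	struct pfarg_pmr pmc, pmd;
	long fd;

	/* create a session; sif returns the available PMC/PMD bitvectors */
	memset(&sif, 0, sizeof(sif));
	fd = syscall(__NR_pfm_create, 0, &sif, NULL, NULL, 0);
	if (fd < 0) {
		perror("pfm_create");
		return 1;
	}

	/* program PMC0 (model-specific event encoding) and arm PMD0 */
	memset(&pmc, 0, sizeof(pmc));
	pmc.reg_num = 0;
	pmc.reg_value = 0x4100c0;	/* illustrative encoding only */

	memset(&pmd, 0, sizeof(pmd));
	pmd.reg_num = 0;
	pmd.reg_value = 0;

	syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pmc, sizeof(pmc));
	syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pmd, sizeof(pmd));

	/*
	 * attaching to a thread and starting/stopping would use the
	 * pfm_attach/pfm_set_state calls (PFM_NO_TARGET, PFM_ST_START,
	 * PFM_ST_STOP) -- not shown in this hunk.
	 */
	close(fd);
	return 0;
}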
diff --git a/include/linux/perfmon_kern.h b/include/linux/perfmon_kern.h
new file mode 100644
index 000000000000..e21cd835bd2c
--- /dev/null
+++ b/include/linux/perfmon_kern.h
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __LINUX_PERFMON_KERN_H__
+#define __LINUX_PERFMON_KERN_H__
+/*
+ * This file contains all the definitions of data structures, variables, macros
+ * that are to be shared between generic code and arch-specific code
+ *
+ * For generic only definitions, use perfmon/perfmon_priv.h
+ */
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/perfmon.h>
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * system administrator configuration controls available via
+ * the /sys/kernel/perfmon interface
+ */
+struct pfm_controls {
+ u32 debug; /* debugging control bitmask */
+ gid_t task_group; /* gid to create a per-task context */
+ size_t arg_mem_max; /* maximum vector argument size */
+};
+extern struct pfm_controls pfm_controls;
+
+/*
+ * event_set: encapsulates the full PMU state
+ */
+struct pfm_event_set {
+ u16 nused_pmds; /* max number of used PMDs */
+ u16 nused_pmcs; /* max number of used PMCs */
+ u32 priv_flags; /* private flags (see below) */
+ u32 npend_ovfls; /* number of pending PMD overflow */
+ u32 pad1; /* padding */
+ u64 used_pmds[PFM_PMD_BV]; /* used PMDs */
+ u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */
+ u64 used_pmcs[PFM_PMC_BV]; /* used PMCs */
+ u64 pmcs[PFM_MAX_PMCS]; /* PMC values */
+ u64 pmds[PFM_MAX_PMDS]; /* PMD values */
+};
+
+/*
+ * common private event set flags (priv_flags)
+ *
+ * upper 16 bits: for arch-specific use
+ * lower 16 bits: for common use
+ */
+#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */
+#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */
+#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS \
+ | PFM_SETFL_PRIV_MOD_PMCS)
+
+
+/*
+ * context flags
+ */
+struct pfm_context_flags {
+ unsigned int started:1; /* pfm_start() issued */
+ unsigned int is_self:1; /* per-thread and self-monitoring */
+ unsigned int work_type:2; /* type of work for pfm_handle_work */
+ unsigned int reserved:28; /* for future use */
+};
+/*
+ * values for work_type (TIF_PERFMON_WORK must be set)
+ */
+#define PFM_WORK_NONE 0 /* nothing to do */
+#define PFM_WORK_ZOMBIE 1 /* cleanup zombie context */
+
+
+/*
+ * perfmon context state
+ */
+#define PFM_CTX_UNLOADED 1 /* context is detached */
+#define PFM_CTX_LOADED 2 /* context is attached */
+#define PFM_CTX_ZOMBIE 3 /* context lost owner but still attached */
+
+/*
+ * registers description
+ */
+struct pfm_regdesc {
+ u64 pmcs[PFM_PMC_BV]; /* available PMC */
+ u64 pmds[PFM_PMD_BV]; /* available PMD */
+ u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */
+ u64 intr_pmds[PFM_PMD_BV]; /* PMD generating intr */
+ u64 cnt_pmds[PFM_PMD_BV]; /* PMD counters */
+ u16 max_pmc; /* highest+1 avail PMC */
+ u16 max_pmd; /* highest+1 avail PMD */
+ u16 max_rw_pmd; /* highest+1 avail RW PMD */
+ u16 first_intr_pmd; /* first intr PMD */
+ u16 max_intr_pmd; /* highest+1 intr PMD */
+ u16 num_rw_pmd; /* number of avail RW PMD */
+ u16 num_pmcs; /* number of logical PMCS */
+ u16 num_pmds; /* number of logical PMDS */
+ u16 num_counters; /* number of counting PMD */
+};
+
+
+/*
+ * context: contains all the state of a session
+ */
+struct pfm_context {
+ spinlock_t lock; /* context protection */
+
+ struct pfm_context_flags flags;
+ u32 state; /* current state */
+ struct task_struct *task; /* attached task */
+
+ u64 last_act; /* last activation */
+ u32 last_cpu; /* last CPU used (SMP only) */
+
+ struct pfm_event_set *active_set; /* active set */
+ struct pfm_event_set _set0; /* event set 0 */
+
+ struct pfm_regdesc regs; /* registers available to context */
+};
+
+/*
+ * logging
+ */
+#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x)
+#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x)
+#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x)
+#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x)
+
+/*
+ * debugging
+ *
+ * Printk rate limiting is enforced to avoid getting flooded with too many
+ * error messages on the console (which could render the machine unresponsive).
+ * To get full debug output (turn off ratelimit):
+ * $ echo 0 >/proc/sys/kernel/printk_ratelimit
+ *
+ * debug is a bitmask where bits are defined as follows:
+ * bit 0: enable non-interrupt code debug messages
+ * bit 1: enable interrupt code debug messages
+ */
+#ifdef CONFIG_PERFMON_DEBUG
+#define _PFM_DBG(lm, f, x...) \
+ do { \
+ if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \
+ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \
+ __func__, __LINE__, \
+ smp_processor_id(), current->pid , ## x); \
+ } \
+ } while (0)
+
+#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x)
+#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ##x)
+#else
+#define PFM_DBG(f, x...) do {} while (0)
+#define PFM_DBG_ovfl(f, x...) do {} while (0)
+#endif
+
+extern struct pfm_pmu_config *pfm_pmu_conf;
+extern int perfmon_disabled;
+
+static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c)
+{
+ return (struct pfm_arch_context *)(c+1);
+}
+
+#include <linux/perfmon_pmu.h>
+
+extern const struct file_operations pfm_file_ops;
+
+void pfm_handle_work(struct pt_regs *regs);
+void __pfm_exit_thread(void);
+void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next);
+void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next);
+void __pfm_init_percpu(void *dummy);
+
+void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs);
+
+int pfm_session_allcpus_acquire(void);
+void pfm_session_allcpus_release(void);
+
+static inline void pfm_exit_thread(void)
+{
+ if (current->pfm_context)
+ __pfm_exit_thread();
+}
+
+/*
+ * include arch-specific kernel level definitions
+ */
+#include <asm/perfmon_kern.h>
+
+static inline void pfm_copy_thread(struct task_struct *task)
+{
+ /*
+ * context or perfmon TIF state is NEVER inherited
+ * by the child task. This holds for per-thread and system-wide.
+ */
+ task->pfm_context = NULL;
+ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
+}
+
+/*
+ * read a single PMD register.
+ */
+static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum)
+{
+ return pfm_arch_read_pmd(ctx, cnum);
+}
+/*
+ * write a single PMD register.
+ */
+static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum,
+ u64 value)
+{
+ /*
+ * PMD writes are ignored for read-only registers
+ */
+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO)
+ return;
+
+ /*
+ * clear unimplemented bits
+ */
+ value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
+
+ pfm_arch_write_pmd(ctx, cnum, value);
+}
+
+DECLARE_PER_CPU(struct pfm_context *, pmu_ctx);
+DECLARE_PER_CPU(struct task_struct *, pmu_owner);
+
+/*
+ * number of u64 to use for stack buffer in
+ * syscalls which take vector argument
+ */
+#ifndef PFM_ARCH_STK_ARG
+#define PFM_ARCH_STK_ARG 2
+#endif
+
+#define PFM_STK_ARG PFM_ARCH_STK_ARG
+
+#else /* !CONFIG_PERFMON */
+/*
+ * perfmon hooks are nops when CONFIG_PERFMON is undefined
+ */
+
+static inline void pfm_exit_thread(void)
+{}
+
+static inline void pfm_handle_work(struct pt_regs *regs)
+{}
+
+static inline void pfm_copy_thread(struct task_struct *t)
+{}
+
+static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n)
+{}
+
+static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n)
+{}
+
+static inline void pfm_session_allcpus_release(void)
+{}
+
+static inline int pfm_session_allcpus_acquire(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PERFMON */
+#endif /* __LINUX_PERFMON_KERN_H__ */
diff --git a/include/linux/perfmon_pmu.h b/include/linux/perfmon_pmu.h
new file mode 100644
index 000000000000..13d357140243
--- /dev/null
+++ b/include/linux/perfmon_pmu.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Interface for PMU description modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#ifndef __PERFMON_PMU_H__
+#define __PERFMON_PMU_H__ 1
+
+/*
+ * generic information about a PMC or PMD register
+ */
+struct pfm_regmap_desc {
+ u16 type; /* register infos */
+ u16 reserved1; /* for future use */
+ u32 reserved2; /* for future use */
+ u64 dfl_val; /* power-on default value (quiescent) */
+ u64 rsvd_msk; /* reserved bits: 1 means reserved */
+ u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */
+ unsigned long hw_addr; /* HW register address or index */
+ struct kobject kobj; /* for internal use only */
+ char *desc; /* HW register description string */
+ u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */
+};
+
+/*
+ * pfm_reg_desc helper macros
+ */
+#define PMC_D(t, d, v, r, n, h) \
+ { .type = t, \
+ .desc = d, \
+ .dfl_val = v, \
+ .rsvd_msk = r, \
+ .no_emul64_msk = n, \
+ .hw_addr = h \
+ }
+
+#define PMD_D(t, d, h) \
+ { .type = t, \
+ .desc = d, \
+ .rsvd_msk = 0, \
+ .no_emul64_msk = 0, \
+ .hw_addr = h \
+ }
+
+#define PMD_DR(t, d, h, r) \
+ { .type = t, \
+ .desc = d, \
+ .rsvd_msk = r, \
+ .no_emul64_msk = 0, \
+ .hw_addr = h \
+ }
+
+#define PMX_NA \
+ { .type = PFM_REG_NA }
+
+/*
+ * type of a PMU register (16-bit bitmask) for use with pfm_regmap_desc.type
+ */
+#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */
+#define PFM_REG_I 0x01 /* PMC/PMD: implemented */
+#define PFM_REG_WC 0x02 /* PMC: has write_checker */
+#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */
+#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */
+#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */
+#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */
+
+/*
+ * define some shortcuts for common types
+ */
+#define PFM_REG_W (PFM_REG_WC|PFM_REG_I)
+#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I)
+#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I)
+#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I)
+#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO)
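+
+/*
+ * For example, PFM_REG_C describes a classical counting PMD: implemented,
+ * using 64-bit virtualization, and able to generate an overflow interrupt.
+ */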
+
+typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req);
+
+typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ struct pfarg_pmr *req);
+
+/*
+ * structure used by PMU description modules
+ */
+struct pfm_pmu_config {
+ char *pmu_name; /* PMU family name */
+ char *version; /* config module version */
+
+ int counter_width; /* width of hardware counter */
+
+ struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */
+ struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */
+
+ pfm_pmc_check_t pmc_write_check;/* write checker (optional) */
+ pfm_pmd_check_t pmd_write_check;/* write checker (optional) */
+ pfm_pmd_check_t pmd_read_check; /* read checker (optional) */
+
+ u16 num_pmc_entries;/* #entries in pmc_desc */
+ u16 num_pmd_entries;/* #entries in pmd_desc */
+ void *pmu_info; /* model-specific infos */
+ /*
+ * fields computed internally, do not set in module
+ */
+ struct pfm_regdesc regs_all; /* regs available to all */
+ u64 ovfl_mask; /* overflow mask */
+};
+
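+/*
+ * A minimal, purely illustrative sketch of how a PMU description module
+ * might fill this structure; register names and hardware addresses below
+ * are made up:
+ *
+ *	static struct pfm_regmap_desc foo_pmc_desc[] = {
+ *		PMC_D(PFM_REG_I, "FOO_EVTSEL0", 0, 0, 0, 0x100),
+ *	};
+ *	static struct pfm_regmap_desc foo_pmd_desc[] = {
+ *		PMD_D(PFM_REG_C, "FOO_CTR0", 0x200),
+ *	};
+ *
+ *	static struct pfm_pmu_config foo_pmu_conf = {
+ *		.pmu_name        = "Foo",
+ *		.version         = "1.0",
+ *		.counter_width   = 48,
+ *		.pmc_desc        = foo_pmc_desc,
+ *		.num_pmc_entries = 1,
+ *		.pmd_desc        = foo_pmd_desc,
+ *		.num_pmd_entries = 1,
+ *	};
+ *
+ * The module init code would then call pfm_pmu_register(&foo_pmu_conf).
+ */
+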
+static inline void *pfm_pmu_info(void)
+{
+ return pfm_pmu_conf->pmu_info;
+}
+
+int pfm_pmu_register(struct pfm_pmu_config *cfg);
+
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
+
+#endif /* __PERFMON_PMU_H__ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 755b5705cd38..8e23536e66be 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1306,6 +1306,10 @@ struct task_struct {
unsigned long default_timer_slack_ns;
struct list_head *scm_work_list;
+
+#if defined(CONFIG_PERFMON_V20) || defined(CONFIG_PERFMON)
+ struct pfm_context *pfm_context;
+#endif
};
/*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d6ff145919ca..d12a175e0f43 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -625,4 +625,15 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+#ifdef CONFIG_PERFMON_V20
+struct pfarg_sinfo;
+asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo *s,
+ char __user *f, void __user *uarg, size_t uarg_size);
+
+asmlinkage long sys_pfm_write(int fd, int flags, int type, void __user *arg, size_t s);
+asmlinkage long sys_pfm_read(int fd, int flags, int type, void __user *arg, size_t s);
+asmlinkage long sys_pfm_attach(int fd, int flags, int target);
+asmlinkage long sys_pfm_set_state(int fd, int flags, int state);
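+
+/*
+ * Illustrative call sequence from user space (arguments abbreviated):
+ *
+ *	fd = pfm_create(0, &sinfo, ...);  create a session, learn available regs
+ *	pfm_write(fd, ...);               program PMC/PMD registers
+ *	pfm_attach(fd, 0, target);        attach to a thread (or to self)
+ *	pfm_set_state(fd, ...);           start monitoring
+ *	pfm_read(fd, ...);                read the counters back
+ */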
+#endif /* CONFIG_PERFMON_V20 */
+
#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..1432b300e1ca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,10 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+
+/* perfmon */
+cond_syscall(sys_pfm_create);
+cond_syscall(sys_pfm_write);
+cond_syscall(sys_pfm_read);
+cond_syscall(sys_pfm_attach);
+cond_syscall(sys_pfm_set_state);
diff --git a/perfmon/Makefile b/perfmon/Makefile
new file mode 100644
index 000000000000..4ee61aa50675
--- /dev/null
+++ b/perfmon/Makefile
@@ -0,0 +1,10 @@
+#
+# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian <eranian@gmail.com>
+#
+obj-y = perfmon_ctx.o perfmon_file.o \
+ perfmon_attach.o perfmon_res.o \
+ perfmon_init.o perfmon_activate.o \
+ perfmon_intr.o perfmon_rw.o \
+ perfmon_ctxsw.o perfmon_pmu.o \
+ perfmon_syscalls.o perfmon_sysfs.o
diff --git a/perfmon/perfmon_activate.c b/perfmon/perfmon_activate.c
new file mode 100644
index 000000000000..9398e7c15215
--- /dev/null
+++ b/perfmon/perfmon_activate.c
@@ -0,0 +1,136 @@
+/*
+ * perfmon_activate.c: perfmon2 start/stop functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * __pfm_start - activate monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and not self-monitoring, the monitored
+ * thread must be stopped. Activation will be effective next time the thread
+ * is context switched in.
+ *
+ * Monitoring resumes on the last active event set; set0 is used for the
+ * first activation.
+ *
+ * On some architectures, e.g., IA-64, it may be possible to start monitoring
+ * without calling this function under certain conditions (per-thread and self
+ * monitoring). In this case, either set0 or the last active set is used.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_start(struct pfm_context *ctx)
+{
+ struct task_struct *task;
+ struct pfm_event_set *set;
+
+ task = ctx->task;
+
+ /*
+ * UNLOADED: error
+ * LOADED : normal start, nop if started
+ * ZOMBIE : cannot happen
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ set = ctx->active_set;
+
+ /*
+ * mark as started
+ * must be done before calling pfm_arch_start()
+ */
+ ctx->flags.started = 1;
+
+ pfm_arch_start(task, ctx);
+
+ /*
+ * we check whether we had a pending ovfl before restarting.
+	 * If so we need to regenerate the interrupt to make sure we
+	 * do not lose any overflow information. For non-self monitoring
+	 * this check is done in the __pfm_ctxswin_thread() routine.
+	 */
+ if (task == current && set->npend_ovfls)
+ pfm_arch_resend_irq(ctx);
+
+ return 0;
+}
+
+/**
+ * __pfm_stop - stop monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and when not self-monitoring,
+ * the monitored thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_stop(struct pfm_context *ctx)
+{
+ struct task_struct *task;
+
+ /*
+ * context must be attached (zombie cannot happen)
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ task = ctx->task;
+
+ PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d",
+ task ? task->pid : -1,
+ ctx->state,
+ !task);
+
+ pfm_arch_stop(task, ctx);
+
+ ctx->flags.started = 0;
+ /*
+ * starting now, in-flight PMU interrupt for this context
+ * are treated as spurious
+ */
+ return 0;
+}
diff --git a/perfmon/perfmon_attach.c b/perfmon/perfmon_attach.c
new file mode 100644
index 000000000000..4ef00982f218
--- /dev/null
+++ b/perfmon/perfmon_attach.c
@@ -0,0 +1,337 @@
+/*
+ * perfmon_attach.c: perfmon2 load/unload functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * pfm_load_ctx_thread - attach context to a thread
+ * @ctx: context to operate on
+ * @task: thread to attach to
+ *
+ * The function must be called with the context locked and interrupts disabled.
+ */
+static int pfm_load_ctx_thread(struct pfm_context *ctx,
+ struct task_struct *task)
+{
+ struct pfm_event_set *set;
+ struct pfm_context *old;
+ int ret;
+ u16 max;
+
+ PFM_DBG("pid=%d", task->pid);
+
+ /*
+ * we must use cmpxchg to avoid race condition with another
+ * context trying to attach to the same task.
+ *
+ * per-thread:
+ * - task to attach to is checked in sys_pfm_attach() to avoid
+ * locking issues. if found, and not self, task refcount was
+ * incremented.
+ */
+ old = cmpxchg(&task->pfm_context, NULL, ctx);
+ if (old) {
+ PFM_DBG("load_pid=%d has a context "
+ "old=%p new=%p cur=%p",
+ task->pid,
+ old,
+ ctx,
+ task->pfm_context);
+ return -EEXIST;
+ }
+
+ /*
+ * initialize sets
+ */
+ set = ctx->active_set;
+
+ /*
+ * cleanup bitvectors
+ */
+ max = ctx->regs.max_intr_pmd;
+ pfm_arch_bv_zero(set->povfl_pmds, max);
+
+ set->npend_ovfls = 0;
+
+ /*
+ * we cannot just use plain clear because of arch-specific flags
+ */
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ /*
+ * link context to task
+ */
+ ctx->task = task;
+
+ /*
+ * perform any architecture specific actions
+ */
+ ret = pfm_arch_load_context(ctx);
+ if (ret)
+ goto error_noload;
+
+ /*
+ * now reserve the session, before we can proceed with
+ * actually accessing the PMU hardware
+ */
+ ret = pfm_session_acquire();
+ if (ret)
+ goto error;
+
+ if (ctx->task != current) {
+
+ /* not self-monitoring */
+ ctx->flags.is_self = 0;
+
+ /* force a full reload */
+ ctx->last_act = PFM_INVALID_ACTIVATION;
+ ctx->last_cpu = -1;
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
+
+ } else {
+ /*
+ * on UP, we may have to push out the PMU
+ * state of the last monitored thread
+ */
+ pfm_check_save_prev_ctx();
+
+ ctx->last_cpu = smp_processor_id();
+ __get_cpu_var(pmu_activation_number)++;
+ ctx->last_act = __get_cpu_var(pmu_activation_number);
+
+ ctx->flags.is_self = 1;
+
+ /*
+ * load PMD from set
+ * load PMC from set
+ */
+ pfm_arch_restore_pmds(ctx, set);
+ pfm_arch_restore_pmcs(ctx, set);
+
+ /*
+ * set new ownership
+ */
+ pfm_set_pmu_owner(ctx->task, ctx);
+ }
+
+ /*
+ * will cause switch_to() to invoke PMU
+ * context switch code
+ */
+ set_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
+
+ ctx->state = PFM_CTX_LOADED;
+
+ return 0;
+
+error:
+ pfm_arch_unload_context(ctx);
+ ctx->task = NULL;
+error_noload:
+ /*
+ * detach context
+ */
+ task->pfm_context = NULL;
+ return ret;
+}
+
+/**
+ * __pfm_load_context - attach context to a thread
+ * @ctx: context to operate on
+ * @task: thread to attach to
+ */
+int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task)
+{
+ return pfm_load_ctx_thread(ctx, task);
+}
+
+/**
+ * pfm_update_ovfl_pmds - account for pending ovfls on PMDs
+ * @ctx: context to operate on
+ *
+ * This function is always called after pfm_stop has been issued
+ */
+static void pfm_update_ovfl_pmds(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 *cnt_pmds;
+ u64 ovfl_mask;
+ u16 num_ovfls, i;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ cnt_pmds = ctx->regs.cnt_pmds;
+ set = ctx->active_set;
+
+ if (!set->npend_ovfls)
+ return;
+
+ num_ovfls = set->npend_ovfls;
+ PFM_DBG("novfls=%u", num_ovfls);
+
+ for (i = 0; num_ovfls; i++) {
+ if (pfm_arch_bv_test_bit(i, set->povfl_pmds)) {
+ /* only correct value for counters */
+ if (pfm_arch_bv_test_bit(i, cnt_pmds))
+ set->pmds[i] += 1 + ovfl_mask;
+ num_ovfls--;
+ }
+ PFM_DBG("pmd%u val=0x%llx",
+ i,
+ (unsigned long long)set->pmds[i]);
+ }
+ /*
+	 * we need to clear them to prevent stale overflow information
+	 * from being reported even after the context is unloaded
+ */
+ set->npend_ovfls = 0;
+ pfm_arch_bv_zero(set->povfl_pmds, ctx->regs.max_intr_pmd);
+}
+
+/**
+ * __pfm_unload_context - detach context from CPU or thread
+ * @ctx: context to operate on
+ *
+ * The function must be called with the context locked and interrupts disabled.
+ */
+int __pfm_unload_context(struct pfm_context *ctx)
+{
+ int ret;
+
+ PFM_DBG("ctx_state=%d task [%d]",
+ ctx->state,
+ ctx->task ? ctx->task->pid : -1);
+
+ /*
+ * check unload-able state
+ */
+ if (ctx->state == PFM_CTX_UNLOADED)
+ return -EINVAL;
+
+ /*
+ * stop monitoring
+ */
+ ret = __pfm_stop(ctx);
+ if (ret)
+ return ret;
+
+ ctx->state = PFM_CTX_UNLOADED;
+
+ /*
+ * save active set
+ * UP:
+	 *   if not the current task, then due to lazy save the
+	 *   state may still be live
+ * for system-wide, guaranteed to run on correct CPU
+ */
+ if (__get_cpu_var(pmu_ctx) == ctx) {
+ /*
+ * pending overflows have been saved by pfm_stop()
+ */
+ pfm_save_pmds(ctx);
+ pfm_set_pmu_owner(NULL, NULL);
+ PFM_DBG("released ownership");
+ }
+
+ /*
+ * account for pending overflows
+ */
+ pfm_update_ovfl_pmds(ctx);
+
+ /*
+ * arch-specific unload operations
+ */
+ pfm_arch_unload_context(ctx);
+
+ /*
+ * per-thread: disconnect from monitored task
+ */
+ if (ctx->task) {
+ ctx->task->pfm_context = NULL;
+ clear_tsk_thread_flag(ctx->task, TIF_PERFMON_CTXSW);
+ ctx->task = NULL;
+ }
+ return 0;
+}
+
+/**
+ * __pfm_exit_thread - detach and free context on thread exit
+ */
+void __pfm_exit_thread(void)
+{
+ struct pfm_context *ctx;
+ unsigned long flags;
+ int free_ok = 0, ret = -1;
+
+ ctx = current->pfm_context;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self);
+
+ /*
+ * __pfm_unload_context() cannot fail
+ * in the context states we are interested in
+ */
+ switch (ctx->state) {
+ case PFM_CTX_LOADED:
+ ret = __pfm_unload_context(ctx);
+ break;
+ case PFM_CTX_ZOMBIE:
+ ret = __pfm_unload_context(ctx);
+ free_ok = 1;
+ break;
+ default:
+ BUG_ON(ctx->state != PFM_CTX_LOADED);
+ break;
+ }
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (!ret)
+ pfm_session_release();
+
+ /*
+ * All memory free operations (especially for vmalloc'ed memory)
+ * MUST be done with interrupts ENABLED.
+ */
+ if (free_ok)
+ pfm_free_context(ctx);
+}
diff --git a/perfmon/perfmon_ctx.c b/perfmon/perfmon_ctx.c
new file mode 100644
index 000000000000..985977069a40
--- /dev/null
+++ b/perfmon/perfmon_ctx.c
@@ -0,0 +1,400 @@
+/*
+ * perfmon_ctx.c: perfmon2 context functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * context memory pool pointer
+ */
+static struct kmem_cache *pfm_ctx_cachep;
+
+/*
+ * This function is called when we need to perform asynchronous
+ * work on a context. It is called ONLY when about to
+ * return to user mode (very much like with signal handling).
+ *
+ * we come here if:
+ *
+ * - we are zombie and we need to cleanup our state
+ *
+ * pfm_handle_work() can be called with interrupts enabled
+ * (TIF_NEED_RESCHED) or disabled.
+ */
+void pfm_handle_work(struct pt_regs *regs)
+{
+ struct pfm_context *ctx;
+ unsigned long flags;
+ int type;
+
+ if (!user_mode(regs))
+ return;
+
+ clear_thread_flag(TIF_PERFMON_WORK);
+
+ ctx = current->pfm_context;
+ if (ctx == NULL) {
+ PFM_DBG("[%d] has no ctx", current->pid);
+ return;
+ }
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ type = ctx->flags.work_type;
+ ctx->flags.work_type = PFM_WORK_NONE;
+
+ PFM_DBG("work_type=%d", type);
+
+ switch (type) {
+ case PFM_WORK_ZOMBIE:
+ goto do_zombie;
+ default:
+ PFM_DBG("unkown type=%d", type);
+ goto nothing_todo;
+ }
+nothing_todo:
+ /*
+ * restore flags as they were upon entry
+ */
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ return;
+
+do_zombie:
+ PFM_DBG("context is zombie, bailing out");
+
+ /* always returns 0 in this case */
+ __pfm_unload_context(ctx);
+
+ /*
+ * keep the spinlock check happy
+ */
+ spin_unlock(&ctx->lock);
+
+ /*
+ * enable interrupt for vfree()
+ */
+ local_irq_enable();
+
+ /*
+ * actual context free
+ */
+ pfm_free_context(ctx);
+
+ /*
+ * restore interrupts as they were upon entry
+ */
+ local_irq_restore(flags);
+
+ /*
+ * pfm_unload always successful, so can release
+ * session safely
+ */
+ pfm_session_release();
+}
+
+/**
+ * pfm_free_context - de-allocate context and associated resources
+ * @ctx: context to free
+ */
+void pfm_free_context(struct pfm_context *ctx)
+{
+ pfm_arch_context_free(ctx);
+
+ PFM_DBG("free ctx @0x%p", ctx);
+ kmem_cache_free(pfm_ctx_cachep, ctx);
+ /*
+ * decrease refcount on:
+ * - PMU description table
+ */
+ pfm_pmu_release();
+}
+
+/**
+ * pfm_init_ctx -- initialize context SLAB
+ *
+ * called from pfm_init
+ */
+int __init pfm_init_ctx(void)
+{
+ pfm_ctx_cachep = kmem_cache_create("pfm_context",
+ sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
+			0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!pfm_ctx_cachep) {
+ PFM_ERR("cannot initialize context slab");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/**
+ * pfm_ctx_permissions - check authorization to create new context
+ * @ctx_flags: context flags passed by user
+ *
+ * check for permissions to create a context.
+ *
+ * A sysadmin may decide to restrict creation of per-thread
+ * context to a group of users using the group id via
+ * /sys/kernel/perfmon/task_group
+ *
+ * Once we identify a user level package which can be used
+ * to grant/revoke Linux capabilities at login via PAM, we will
+ * be able to use capabilities. We would also need to increase
+ * the size of cap_t to support more than 32 capabilities (it
+ * is currently defined as u32 and 32 capabilities are already
+ * defined).
+ */
+static inline int pfm_ctx_permissions(u32 ctx_flags)
+{
+ if (pfm_controls.task_group != PFM_GROUP_PERM_ANY
+ && !in_group_p(pfm_controls.task_group)) {
+ PFM_DBG("user group not allowed to create a task context");
+ return -EPERM;
+ }
+ return 0;
+}
+
+/**
+ * pfm_create_initial_set - create the initial set from __pfm_create_context
+ * @ctx: context to attach the set to
+ */
+static void pfm_create_initial_set(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 *impl_pmcs;
+ u16 i, max_pmc;
+
+ set = ctx->active_set;
+ max_pmc = ctx->regs.max_pmc;
+ impl_pmcs = ctx->regs.pmcs;
+
+ /*
+ * install default values for all PMC registers
+ */
+ for (i = 0; i < max_pmc; i++) {
+ if (pfm_arch_bv_test_bit(i, impl_pmcs)) {
+ set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val;
+ PFM_DBG("pmc%u=0x%llx",
+ i,
+ (unsigned long long)set->pmcs[i]);
+ }
+ }
+ /*
+ * PMD registers are set to 0 when the event set is allocated,
+ * hence we do not need to explicitly initialize them.
+ *
+ * For virtual PMD registers (i.e., those tied to a SW resource)
+ * their value becomes meaningful once the context is attached.
+ */
+}
+
+/**
+ * __pfm_create_context - allocate and initialize a perfmon context
+ * @ctx_flags : user context flags
+ * @sif: pointer to pfarg_sinfo to be updated
+ * @new_ctx: will contain new context address on return
+ *
+ * function used to allocate a new context. A context is allocated along
+ * with the default event set. If a sampling format is used, the buffer
+ * may be allocated and initialized.
+ *
+ * The file descriptor identifying the context is allocated and returned
+ * to caller.
+ *
+ * This function operates with no locks and interrupts are enabled.
+ * return:
+ * >=0: the file descriptor to identify the context
+ * <0 : the error code
+ */
+int __pfm_create_context(__u32 ctx_flags,
+ struct pfarg_sinfo *sif,
+ struct pfm_context **new_ctx)
+{
+ struct pfm_context *ctx;
+ struct file *filp = NULL;
+ int fd = 0, ret = -EINVAL;
+
+ if (!pfm_pmu_conf)
+ return -ENOSYS;
+
+ /* no context flags supported yet */
+ if (ctx_flags)
+ goto error_alloc;
+
+ ret = pfm_ctx_permissions(ctx_flags);
+ if (ret < 0)
+ goto error_alloc;
+
+ /*
+ * we can use GFP_KERNEL and potentially sleep because we do
+ * not hold any lock at this point.
+ */
+ might_sleep();
+ ret = -ENOMEM;
+ ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto error_alloc;
+
+ PFM_DBG("alloc ctx @0x%p", ctx);
+
+ ctx->active_set = &ctx->_set0;
+
+ spin_lock_init(&ctx->lock);
+
+ /*
+ * context is unloaded
+ */
+ ctx->state = PFM_CTX_UNLOADED;
+
+ ret = pfm_pmu_acquire(ctx);
+ if (ret)
+ goto error_file;
+ /*
+ * check if PMU is usable
+ */
+	if (!(ctx->regs.num_pmcs && ctx->regs.num_pmds)) {
+ PFM_DBG("no usable PMU registers");
+ ret = -EBUSY;
+ goto error_file;
+ }
+
+ ret = -ENFILE;
+ fd = pfm_alloc_fd(&filp);
+ if (fd < 0)
+ goto error_file;
+
+ /*
+ * initialize arch-specific section
+ * must be done before fmt_init()
+ */
+ ret = pfm_arch_context_create(ctx, ctx_flags);
+ if (ret)
+ goto error_set;
+
+ ret = -ENOMEM;
+
+ /*
+ * add initial set
+ */
+ pfm_create_initial_set(ctx);
+
+ filp->private_data = ctx;
+
+ ctx->last_act = PFM_INVALID_ACTIVATION;
+ ctx->last_cpu = -1;
+
+ PFM_DBG("flags=0x%x fd=%d", ctx_flags, fd);
+
+ if (new_ctx)
+ *new_ctx = ctx;
+
+ /*
+ * copy bitmask of available PMU registers
+ *
+ * must copy over the entire vector to avoid
+	 * returning bogus upper bits passed in by the user
+ */
+ pfm_arch_bv_copy(sif->sif_avail_pmcs,
+ ctx->regs.pmcs,
+ PFM_MAX_PMCS);
+
+ pfm_arch_bv_copy(sif->sif_avail_pmds,
+ ctx->regs.pmds,
+ PFM_MAX_PMDS);
+
+ /*
+ * we defer the fd_install until we are certain the call succeeded
+ * to ensure we do not have to undo its effect. Neither put_filp()
+ * nor put_unused_fd() undoes the effect of fd_install().
+ */
+ fd_install(fd, filp);
+
+ return fd;
+
+error_set:
+ put_filp(filp);
+ put_unused_fd(fd);
+error_file:
+ /*
+ * calls the right *_put() functions
+ * calls pfm_release_pmu()
+ */
+ pfm_free_context(ctx);
+ return ret;
+error_alloc:
+ return ret;
+}
+
+/**
+ * pfm_undo_create -- undo context creation
+ * @fd: file descriptor to close
+ * @ctx: newly created context
+ *
+ * upon return neither fd nor ctx is usable
+ */
+void pfm_undo_create(int fd, struct pfm_context *ctx)
+{
+ struct files_struct *files = current->files;
+ struct file *file;
+ int fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ /*
+ * there is no fd_uninstall(), so we do it
+ * here. put_unused_fd() does not remove the
+ * effect of fd_install().
+ */
+
+ spin_lock(&files->file_lock);
+ files->fd_array[fd] = NULL;
+ spin_unlock(&files->file_lock);
+
+ fput_light(file, fput_needed);
+
+ /*
+ * decrement ref count and kill file
+ */
+ put_filp(file);
+
+ put_unused_fd(fd);
+
+ pfm_free_context(ctx);
+}
diff --git a/perfmon/perfmon_ctxsw.c b/perfmon/perfmon_ctxsw.c
new file mode 100644
index 000000000000..b1086f6dca31
--- /dev/null
+++ b/perfmon/perfmon_ctxsw.c
@@ -0,0 +1,252 @@
+/*
+ * perfmon_ctxsw.c: perfmon2 context switch code
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
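+/**
+ * pfm_save_pmds - save current hardware PMD values into the active set
+ * @ctx: context to operate on
+ *
+ * Called with the context locked and interrupts masked.
+ */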
+void pfm_save_pmds(struct pfm_context *ctx)
+{
+ struct pfm_event_set *set;
+ u64 val, ovfl_mask;
+ u64 *used_pmds, *cnt_pmds;
+ u16 i, num;
+
+ set = ctx->active_set;
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ num = set->nused_pmds;
+ cnt_pmds = ctx->regs.cnt_pmds;
+ used_pmds = set->used_pmds;
+
+ /*
+ * save HW PMD, for counters, reconstruct 64-bit value
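+	 * (e.g., with a 48-bit wide counter, ovfl_mask = 2^48 - 1: the low
+	 *  48 bits come from the hardware read while the upper bits come
+	 *  from the 64-bit software copy in set->pmds[])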
+ */
+ for (i = 0; num; i++) {
+ if (pfm_arch_bv_test_bit(i, used_pmds)) {
+ val = pfm_read_pmd(ctx, i);
+ if (likely(pfm_arch_bv_test_bit(i, cnt_pmds)))
+ val = (set->pmds[i] & ~ovfl_mask) |
+ (val & ovfl_mask);
+ set->pmds[i] = val;
+ num--;
+ }
+ }
+}
+
+/*
+ * interrupts are disabled (no preemption)
+ */
+void __pfm_ctxswin_thread(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ u64 cur_act;
+ struct pfm_event_set *set;
+ int reload_pmcs, reload_pmds;
+ int mycpu, is_active;
+
+ mycpu = smp_processor_id();
+
+ cur_act = __get_cpu_var(pmu_activation_number);
+ /*
+ * we need to lock context because it could be accessed
+	 * from another CPU. Normally the schedule() code
+	 * has masked interrupts which should be enough to
+ * protect against PMU interrupts.
+ */
+ spin_lock(&ctx->lock);
+
+ is_active = pfm_arch_is_active(ctx);
+
+ set = ctx->active_set;
+
+ /*
+	 * in case of zombie, we do not complete the ctxswin of the
+	 * PMU, and we force a call to pfm_handle_work() to finish
+	 * cleanup, i.e., free the context. The reason for
+ * deferring to pfm_handle_work() is that it is not possible
+ * to vfree() with interrupts disabled.
+ */
+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
+ pfm_post_work(task, ctx, PFM_WORK_ZOMBIE);
+ goto done;
+ }
+
+ /*
+ * if we were the last user of the PMU on that CPU,
+ * then nothing to do except restore psr
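+	 *
+	 * (last_cpu/last_act record where and under which per-CPU activation
+	 *  number this context last owned the PMU; the activation number is
+	 *  bumped every time a context takes ownership, so a match on both
+	 *  means no other context has touched the PMU state in the meantime)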
+ */
+ if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) {
+ /*
+ * check for forced reload conditions
+ */
+ reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
+ reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
+ } else {
+#ifndef CONFIG_SMP
+ pfm_check_save_prev_ctx();
+#endif
+ reload_pmcs = 1;
+ reload_pmds = 1;
+ }
+ /* consumed */
+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+ if (reload_pmds)
+ pfm_arch_restore_pmds(ctx, set);
+
+ /*
+	 * need to check if we had an in-flight interrupt in
+	 * pfm_ctxswout_thread(). If at least one bit is set, then we must replay
+ * the interrupt to avoid losing some important performance data.
+ *
+ * npend_ovfls is cleared in interrupt handler
+ */
+ if (set->npend_ovfls)
+ pfm_arch_resend_irq(ctx);
+
+ if (reload_pmcs)
+ pfm_arch_restore_pmcs(ctx, set);
+
+ /*
+ * record current activation for this context
+ */
+ __get_cpu_var(pmu_activation_number)++;
+ ctx->last_cpu = mycpu;
+ ctx->last_act = __get_cpu_var(pmu_activation_number);
+
+ /*
+ * establish new ownership.
+ */
+ pfm_set_pmu_owner(task, ctx);
+
+ pfm_arch_ctxswin_thread(task, ctx);
+done:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * interrupts are masked, runqueue lock is held.
+ *
+ * In UP. we simply stop monitoring and leave the state
+ * in place, i.e., lazy save
+ */
+void __pfm_ctxswout_thread(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ int need_save_pmds, is_active;
+
+ /*
+ * we need to lock context because it could be accessed
+	 * from another CPU. Normally the schedule() code
+	 * has masked interrupts which should be enough to
+ * protect against PMU interrupts.
+ */
+
+ spin_lock(&ctx->lock);
+
+ is_active = pfm_arch_is_active(ctx);
+
+ /*
+ * stop monitoring and
+ * collect pending overflow information
+ * needed on ctxswin. We cannot afford to lose
+ * a PMU interrupt.
+ */
+ need_save_pmds = pfm_arch_ctxswout_thread(task, ctx);
+
+#ifdef CONFIG_SMP
+ /*
+ * in SMP, release ownership of this PMU.
+ * PMU interrupts are masked, so nothing
+ * can happen.
+ */
+ pfm_set_pmu_owner(NULL, NULL);
+
+ /*
+ * On some architectures, it is necessary to read the
+ * PMD registers to check for pending overflow in
+ * pfm_arch_ctxswout_thread(). In that case, saving of
+ * the PMDs may be done there and not here.
+ */
+ if (need_save_pmds)
+ pfm_save_pmds(ctx);
+#endif
+ spin_unlock(&ctx->lock);
+}
+
+/**
+ * pfm_ctxsw_out - save PMU state on context switch out
+ * @prev: thread being switched out
+ * @next: thread being switched in
+ *
+ * We also pass the next thread because on some platforms it may be
+ * necessary to transfer some settings from the current thread to the next
+ *
+ * Interrupts are masked
+ */
+void pfm_ctxsw_out(struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct pfm_context *ctxp;
+
+ ctxp = prev->pfm_context;
+
+ if (ctxp)
+ __pfm_ctxswout_thread(prev, ctxp);
+}
+
+/**
+ * pfm_ctxsw_in - restore PMU state on context switch in
+ * @prev: thread being switched out
+ * @next: thread being switched in
+ *
+ * We also pass the prev thread because on some platforms it may be
+ * necessary to transfer some settings from the previous thread to the next
+ *
+ * Interrupts are masked
+ */
+void pfm_ctxsw_in(struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct pfm_context *ctxn;
+
+ ctxn = next->pfm_context;
+
+ if (ctxn)
+ __pfm_ctxswin_thread(next, ctxn);
+}
diff --git a/perfmon/perfmon_file.c b/perfmon/perfmon_file.c
new file mode 100644
index 000000000000..12ec6b7bea73
--- /dev/null
+++ b/perfmon/perfmon_file.c
@@ -0,0 +1,306 @@
+/*
+ * perfmon_file.c: perfmon2 file input/output functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */
+
+struct pfm_controls pfm_controls = {
+ .task_group = PFM_GROUP_PERM_ANY,
+ .arg_mem_max = PAGE_SIZE,
+};
+
+static int __init enable_debug(char *str)
+{
+ pfm_controls.debug = 1;
+ PFM_INFO("debug output enabled\n");
+ return 1;
+}
+__setup("perfmon_debug", enable_debug);
+
+static int pfmfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt)
+{
+ return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
+}
+
+static struct file_system_type pfm_fs_type = {
+ .name = "pfmfs",
+ .get_sb = pfmfs_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * pfmfs should _never_ be mounted by userland - too much of security hassle,
+ * no real gain from having the whole whorehouse mounted. So we don't need
+ * any operations on the root directory. However, we need a non-trivial
+ * d_name - pfm: will go nicely and kill the special-casing in procfs.
+ */
+static struct vfsmount *pfmfs_mnt;
+
+int __init pfm_init_fs(void)
+{
+ int err = register_filesystem(&pfm_fs_type);
+ if (!err) {
+ pfmfs_mnt = kern_mount(&pfm_fs_type);
+ err = PTR_ERR(pfmfs_mnt);
+ if (IS_ERR(pfmfs_mnt))
+ unregister_filesystem(&pfm_fs_type);
+ else
+ err = 0;
+ }
+ return err;
+}
+
+/*
+ * called either on explicit close() or from exit_files().
+ * Only the LAST user of the file gets to this point, i.e., it is
+ * called only ONCE.
+ *
+ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
+ * (fput()),i.e, last task to access the file. Nobody else can access the
+ * file at this point.
+ *
+ * When called from exit_files(), the VMA has been freed because exit_mm()
+ * is executed before exit_files().
+ *
+ * When called from exit_files(), the current task is not yet ZOMBIE but we
+ * flush the PMU state to the context.
+ */
+static int __pfm_close(struct pfm_context *ctx, struct file *filp)
+{
+ unsigned long flags;
+ int state;
+ int can_free = 1, can_unload = 1;
+ int can_release = 0;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ state = ctx->state;
+
+ PFM_DBG("state=%d", state);
+
+ /*
+ * check if unload is needed
+ */
+ if (state == PFM_CTX_UNLOADED)
+ goto doit;
+
+#ifdef CONFIG_SMP
+ if (ctx->task != current) {
+ /*
+ * switch context to zombie state
+ */
+ ctx->state = PFM_CTX_ZOMBIE;
+
+ PFM_DBG("zombie ctx for [%d]", ctx->task->pid);
+ /*
+ * PMU session will be released by monitored task when
+ * it notices ZOMBIE state as part of pfm_unload_context()
+ */
+ can_unload = can_free = 0;
+ }
+#endif
+ if (can_unload)
+ can_release = !__pfm_unload_context(ctx);
+doit:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (can_release)
+ pfm_session_release();
+
+ if (can_free)
+ pfm_free_context(ctx);
+
+ return 0;
+}
+
+/*
+ * called either on explicit close() or from exit_files().
+ * Only the LAST user of the file gets to this point, i.e., it is
+ * called only ONCE.
+ *
+ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
+ * (fput()),i.e, last task to access the file. Nobody else can access the
+ * file at this point.
+ *
+ * When called from exit_files(), the VMA has been freed because exit_mm()
+ * is executed before exit_files().
+ *
+ * When called from exit_files(), the current task is not yet ZOMBIE but we
+ * flush the PMU state to the context.
+ */
+static int pfm_close(struct inode *inode, struct file *filp)
+{
+ struct pfm_context *ctx;
+
+ PFM_DBG("called filp=%p", filp);
+
+ ctx = filp->private_data;
+ if (ctx == NULL) {
+ PFM_ERR("no ctx");
+ return -EBADF;
+ }
+ return __pfm_close(ctx, filp);
+}
+
+static int pfm_no_open(struct inode *irrelevant, struct file *dontcare)
+{
+ PFM_DBG("pfm_file_ops");
+
+ return -ENXIO;
+}
+
+static unsigned int pfm_no_poll(struct file *filp, poll_table *wait)
+{
+ return 0;
+}
+
+static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ PFM_DBG("pfm_read called");
+ return -EINVAL;
+}
+
+static ssize_t pfm_write(struct file *file, const char __user *ubuf,
+ size_t size, loff_t *ppos)
+{
+ PFM_DBG("pfm_write called");
+ return -EINVAL;
+}
+
+static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ PFM_DBG("pfm_ioctl called");
+ return -EINVAL;
+}
+
+const struct file_operations pfm_file_ops = {
+ .llseek = no_llseek,
+ .read = pfm_read,
+ .write = pfm_write,
+ .ioctl = pfm_ioctl,
+ .open = pfm_no_open, /* special open to disallow open via /proc */
+ .release = pfm_close,
+ .poll = pfm_no_poll,
+};
+
+static int pfmfs_delete_dentry(struct dentry *dentry)
+{
+ return 1;
+}
+
+static struct dentry_operations pfmfs_dentry_operations = {
+ .d_delete = pfmfs_delete_dentry,
+};
+
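+/*
+ * pfm_alloc_fd - allocate a file and file descriptor backed by an
+ * anonymous pfmfs inode. The returned descriptor is what user space
+ * uses to designate the perfmon context in the pfm_*() system calls.
+ */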
+int pfm_alloc_fd(struct file **cfile)
+{
+ int fd, ret = 0;
+ struct file *file = NULL;
+ struct inode * inode;
+ char name[32];
+ struct qstr this;
+
+ fd = get_unused_fd();
+ if (fd < 0)
+ return -ENFILE;
+
+ ret = -ENFILE;
+
+ file = get_empty_filp();
+ if (!file)
+ goto out;
+
+ /*
+ * allocate a new inode
+ */
+ inode = new_inode(pfmfs_mnt->mnt_sb);
+ if (!inode)
+ goto out;
+
+ PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode);
+
+ inode->i_sb = pfmfs_mnt->mnt_sb;
+ inode->i_mode = S_IFCHR|S_IRUGO;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+
+ sprintf(name, "[%lu]", inode->i_ino);
+ this.name = name;
+ this.hash = inode->i_ino;
+ this.len = strlen(name);
+
+ ret = -ENOMEM;
+
+ /*
+ * allocate a new dcache entry
+ */
+ file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
+ if (!file->f_dentry)
+ goto out;
+
+ file->f_dentry->d_op = &pfmfs_dentry_operations;
+
+ d_add(file->f_dentry, inode);
+ file->f_vfsmnt = mntget(pfmfs_mnt);
+ file->f_mapping = inode->i_mapping;
+
+ file->f_op = &pfm_file_ops;
+ file->f_mode = FMODE_READ;
+ file->f_flags = O_RDONLY;
+ file->f_pos = 0;
+
+ *cfile = file;
+
+ return fd;
+out:
+ if (file)
+ put_filp(file);
+ put_unused_fd(fd);
+ return ret;
+}
diff --git a/perfmon/perfmon_init.c b/perfmon/perfmon_init.c
new file mode 100644
index 000000000000..a92126d1687c
--- /dev/null
+++ b/perfmon/perfmon_init.c
@@ -0,0 +1,87 @@
+/*
+ * perfmon_init.c: perfmon2 global initialization functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://www.hpl.hp.com/research/linux/perfmon
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * external variables
+ */
+DEFINE_PER_CPU(struct task_struct *, pmu_owner);
+DEFINE_PER_CPU(struct pfm_context *, pmu_ctx);
+DEFINE_PER_CPU(u64, pmu_activation_number);
+
+int perfmon_disabled; /* >0 if perfmon is disabled */
+
+/*
+ * global initialization routine, executed only once
+ */
+int __init pfm_init(void)
+{
+ PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+ if (pfm_init_ctx())
+ goto error_disable;
+
+ if (pfm_init_fs())
+ goto error_disable;
+
+ if (pfm_init_sysfs())
+ goto error_disable;
+
+ /*
+ * one time, arch-specific global initialization
+ */
+ if (pfm_arch_init())
+ goto error_disable;
+
+ return 0;
+
+error_disable:
+ PFM_ERR("perfmon is disabled due to initialization error");
+ perfmon_disabled = 1;
+ return -1;
+}
+
+/*
+ * must use subsys_initcall() to ensure that the perfmon2 core
+ * is initialized before any PMU description module when they are
+ * compiled in.
+ */
+subsys_initcall(pfm_init);
diff --git a/perfmon/perfmon_intr.c b/perfmon/perfmon_intr.c
new file mode 100644
index 000000000000..d9e87bb11aa2
--- /dev/null
+++ b/perfmon/perfmon_intr.c
@@ -0,0 +1,295 @@
+/*
+ * perfmon_intr.c: perfmon2 interrupt handling
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation
+ * @ctx: context to operate on
+ * @set: set to operate on
+ *
+ * The function returns the number of 64-bit overflows detected.
+ *
+ * 64-bit software pmds are updated for overflowed pmd registers
+ *
+ * In any case, set->npend_ovfls is cleared
+ */
+static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ u16 i, num_ovfls, max_pmd, max_intr;
+ u16 num_64b_ovfls;
+ u64 old_val, new_val, ovfl_mask;
+
+ num_64b_ovfls = 0;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ max_pmd = ctx->regs.max_pmd;
+ max_intr = ctx->regs.max_intr_pmd;
+
+ num_ovfls = set->npend_ovfls;
+
+ for (i = 0; num_ovfls; i++) {
+ /*
+ * skip pmd which did not overflow
+ */
+ if (!pfm_arch_bv_test_bit(i, set->povfl_pmds))
+ continue;
+
+ num_ovfls--;
+
+ /*
+ * Update software value for counters ONLY
+ *
+ * Note that the pmd is not necessarily 0 at this point as
+ * qualified events may have happened before the PMU was
+ * frozen. The residual count is not taken into consideration
+ * here but will be with any read of the pmd
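+		 *
+		 * (the increment of 1 + ovfl_mask below is one full period
+		 *  of the hardware counter, e.g. 2^48 for a 48-bit counter,
+		 *  credited to the 64-bit software value on each overflow)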
+ */
+ if (likely(pfm_arch_bv_test_bit(i, ctx->regs.cnt_pmds))) {
+ old_val = new_val = set->pmds[i];
+ new_val += 1 + ovfl_mask;
+ set->pmds[i] = new_val;
+ } else {
+ /*
+ * for non counters which interrupt, e.g., AMD IBS,
+ * we consider this equivalent to a 64-bit counter
+ * overflow.
+ */
+ old_val = 1; new_val = 0;
+ }
+
+ /*
+ * check for 64-bit overflow condition
+ */
+ if (likely(old_val > new_val)) {
+ num_64b_ovfls++;
+ } else {
+ /*
+ * on some PMU, it may be necessary to re-arm the PMD
+ */
+ pfm_arch_ovfl_reset_pmd(ctx, i);
+ }
+
+ PFM_DBG_ovfl("pmd%u ovfl=%s new=0x%llx old=0x%llx "
+ "hw_pmd=0x%llx",
+ i,
+ old_val > new_val ? "64-bit" : "HW",
+ (unsigned long long)new_val,
+ (unsigned long long)old_val,
+ (unsigned long long)pfm_read_pmd(ctx, i));
+ }
+ /*
+ * mark the overflows as consumed
+ */
+ set->npend_ovfls = 0;
+ pfm_arch_bv_zero(set->povfl_pmds, max_intr);
+
+ return num_64b_ovfls;
+}
+
+/**
+ * pfm_overflow_handler - main overflow processing routine.
+ * @ctx: context to work on (always current context)
+ * @set: current event set
+ * @ip: interrupt instruction pointer
+ * @regs: machine state
+ */
+static void pfm_overflow_handler(struct pfm_context *ctx,
+ struct pfm_event_set *set,
+ unsigned long ip,
+ struct pt_regs *regs)
+{
+ /*
+ * skip ZOMBIE case
+ */
+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
+ goto stop_monitoring;
+
+ PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p u_pmds=0x%llx",
+ (unsigned long long)set->povfl_pmds[0],
+ set->npend_ovfls,
+ (void *)ip,
+ (unsigned long long)set->used_pmds[0]);
+
+ /*
+	 * process overflowed PMDs (the 64-bit overflow count is not used here)
+ */
+ pfm_intr_process_64bit_ovfls(ctx, set);
+
+ return;
+
+stop_monitoring:
+ /*
+ * Does not happen for a self-monitored context.
+	 * We cannot attach to a kernel-only thread, thus it is safe to
+	 * set TIF bits, i.e., the thread will eventually leave the kernel
+	 * or die and either way we will catch the context and clean it up in
+	 * pfm_handle_work() or pfm_exit_thread().
+ *
+ * Mask until we get to pfm_handle_work()
+ * pfm_mask_monitoring(ctx, set);
+ */
+ PFM_DBG_ovfl("ctx is zombie, converted to spurious");
+ pfm_post_work(current, ctx, PFM_WORK_ZOMBIE);
+}
+
+/**
+ * __pfm_interrupt_handler - 1st level interrupt handler
+ * @ip: interrupted instruction pointer
+ * @regs: machine state
+ *
+ * Function is static because we use a wrapper to easily capture timing info.
+ *
+ * Context locking necessary to avoid concurrent accesses from other CPUs
+ */
+static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
+{
+ struct task_struct *task;
+ struct pfm_context *ctx;
+ struct pfm_event_set *set;
+
+ task = __get_cpu_var(pmu_owner);
+ ctx = __get_cpu_var(pmu_ctx);
+
+ /*
+ * verify if there is a context on this CPU
+ */
+ if (unlikely(ctx == NULL)) {
+ PFM_DBG_ovfl("no ctx");
+ goto spurious;
+ }
+
+ /*
+ * we need to lock context because it could be accessed
+ * from another CPU. Depending on the priority level of
+ * the PMU interrupt or the arch, it may be necessary to
+	 * mask interrupts altogether to avoid a race condition with
+ * the timer interrupt in case of time-based set switching,
+ * for instance.
+ */
+ spin_lock(&ctx->lock);
+
+ set = ctx->active_set;
+
+ /*
+ * For SMP per-thread, it is not possible to have
+ * owner != NULL && task != current.
+ *
+ * For UP per-thread, because of lazy save, it
+ * is possible to receive an interrupt in another task
+ * which is not using the PMU. This means
+ * that the interrupt was in-flight at the
+ * time of pfm_ctxswout_thread(). In that
+ * case, it will be replayed when the task
+ * is scheduled again. Hence we convert to spurious.
+ *
+ * The basic rule is that an overflow is always
+ * processed in the context of the task that
+ * generated it for all per-thread contexts.
+ */
+#ifndef CONFIG_SMP
+ if (unlikely((task && current->pfm_context != ctx))) {
+ PFM_DBG_ovfl("spurious: not owned by current task");
+ goto spurious;
+ }
+#endif
+ /*
+ * check that monitoring is active, otherwise convert
+ * to spurious
+ */
+ if (unlikely(!pfm_arch_is_active(ctx))) {
+ PFM_DBG_ovfl("spurious: monitoring non active");
+ goto spurious;
+ }
+
+ /*
+ * freeze PMU and collect overflowed PMD registers
+ * into set->povfl_pmds. Number of overflowed PMDs
+ * reported in set->npend_ovfls
+ */
+ pfm_arch_intr_freeze_pmu(ctx, set);
+
+ /*
+ * no overflow detected, interrupt may have come
+ * from the previous thread running on this CPU
+ */
+ if (unlikely(!set->npend_ovfls)) {
+ PFM_DBG_ovfl("no npend_ovfls");
+ goto spurious;
+ }
+
+ /*
+ * invoke actual handler
+ */
+ pfm_overflow_handler(ctx, set, ip, regs);
+
+ /*
+ * unfreeze PMU
+ */
+ pfm_arch_intr_unfreeze_pmu(ctx);
+
+ spin_unlock(&ctx->lock);
+
+ return;
+
+spurious:
+ /* ctx may be NULL */
+ pfm_arch_intr_unfreeze_pmu(ctx);
+ if (ctx)
+ spin_unlock(&ctx->lock);
+}
+
+/**
+ * pfm_interrupt_handler - 1st level interrupt handler
+ * @ip: interrupted instruction pointer
+ * @regs: machine state
+ *
+ * Function called from the low-level assembly code or arch-specific perfmon
+ * code. Simple wrapper used for timing purposes. Actual work done in
+ * __pfm_interrupt_handler()
+ */
+void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
+{
+ BUG_ON(!irqs_disabled());
+ __pfm_interrupt_handler(ip, regs);
+}
diff --git a/perfmon/perfmon_pmu.c b/perfmon/perfmon_pmu.c
new file mode 100644
index 000000000000..0e44ee8530a6
--- /dev/null
+++ b/perfmon/perfmon_pmu.c
@@ -0,0 +1,269 @@
+/*
+ * perfmon_pmu.c: perfmon2 PMU configuration management
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+#ifndef CONFIG_MODULE_UNLOAD
+#define module_refcount(n) 1
+#endif
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock);
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock);
+static u32 pfm_pmu_acquired;
+
+/*
+ * perfmon core must access PMU information ONLY through pfm_pmu_conf
+ * if pfm_pmu_conf is NULL, then no description is registered
+ */
+struct pfm_pmu_config *pfm_pmu_conf;
+EXPORT_SYMBOL(pfm_pmu_conf);
+
+/**
+ * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table
+ * @regs: the regdesc structure to initialize
+ * @excl_type: the register type(s) to exclude from this regdesc
+ * @unavail_pmcs: unavailable PMC registers
+ * @unavail_pmds: unavailable PMD registers
+ */
+static void pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type,
+ u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+ struct pfm_regmap_desc *d;
+ u16 n, n2, n_counters, i;
+ int max1, max2, max3;
+
+ /*
+ * compute the number of implemented PMC from the
+ * description table
+ */
+ n = 0;
+ max1 = max2 = -1;
+ d = pfm_pmu_conf->pmc_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (pfm_arch_bv_test_bit(i, unavail_pmcs))
+ continue;
+
+ if (d->type & excl_type)
+ continue;
+
+ pfm_arch_bv_set_bit(i, regs->pmcs);
+
+ max1 = i;
+ n++;
+ }
+
+ regs->max_pmc = max1 + 1;
+ regs->num_pmcs = n;
+
+ n = n_counters = n2 = 0;
+ max1 = max2 = max3 = -1;
+ d = pfm_pmu_conf->pmd_desc;
+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+ if (!(d->type & PFM_REG_I))
+ continue;
+
+ if (pfm_arch_bv_test_bit(i, unavail_pmds))
+ continue;
+
+ if (d->type & excl_type)
+ continue;
+
+ pfm_arch_bv_set_bit(i, regs->pmds);
+ max1 = i;
+ n++;
+
+ /*
+ * read-write registers
+ */
+ if (!(d->type & PFM_REG_RO)) {
+ pfm_arch_bv_set_bit(i, regs->rw_pmds);
+ max3 = i;
+ n2++;
+ }
+
+ /*
+ * counter registers
+ */
+ if (d->type & PFM_REG_C64) {
+ pfm_arch_bv_set_bit(i, regs->cnt_pmds);
+ n_counters++;
+ }
+
+ /*
+ * PMD with intr capabilities
+ */
+ if (d->type & PFM_REG_INTR) {
+ pfm_arch_bv_set_bit(i, regs->intr_pmds);
+ max2 = i;
+ }
+ }
+
+ regs->max_pmd = max1 + 1;
+ regs->max_intr_pmd = max2 + 1;
+
+ regs->num_counters = n_counters;
+ regs->num_pmds = n;
+ regs->max_rw_pmd = max3 + 1;
+ regs->num_rw_pmd = n2;
+}
+
+int pfm_pmu_register(struct pfm_pmu_config *cfg)
+{
+ int ret = -EBUSY;
+
+ if (perfmon_disabled) {
+ PFM_INFO("perfmon disabled, cannot add PMU description");
+ return -ENOSYS;
+ }
+
+ spin_lock(&pfm_pmu_conf_lock);
+
+ if (pfm_pmu_conf)
+ goto unlock;
+
+ pfm_pmu_conf = cfg;
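+	/* e.g., counter_width = 48 gives ovfl_mask = 0x0000ffffffffffff */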
+ pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1;
+
+ ret = pfm_sysfs_add_pmu(pfm_pmu_conf);
+ if (ret)
+ pfm_pmu_conf = NULL;
+
+unlock:
+ spin_unlock(&pfm_pmu_conf_lock);
+
+ if (ret)
+ PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret);
+ else
+ PFM_INFO("%s PMU installed", cfg->pmu_name);
+ return ret;
+}
+
+/*
+ * acquire PMU resource from lower-level PMU register allocator
+ * (currently perfctr-watchdog.c)
+ *
+ * acquisition is done when the first context is created (and not
+ * when it is loaded). We grab all that is defined in the description
+ * module and then we make adjustments at the arch-specific level.
+ *
+ * The PMU resource is released when the last perfmon context is
+ * destroyed.
+ *
+ * interrupts are not masked
+ */
+int pfm_pmu_acquire(struct pfm_context *ctx)
+{
+ u64 unavail_pmcs[PFM_PMC_BV];
+ u64 unavail_pmds[PFM_PMD_BV];
+ int ret = 0;
+
+ spin_lock(&pfm_pmu_acq_lock);
+
+ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+ pfm_pmu_acquired++;
+
+ if (pfm_pmu_acquired == 1) {
+
+ memset(unavail_pmcs, 0, sizeof(unavail_pmcs));
+ memset(unavail_pmds, 0, sizeof(unavail_pmds));
+
+ ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds);
+ if (ret) {
+ pfm_pmu_acquired--;
+ } else {
+ memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc));
+
+ pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all, 0,
+ unavail_pmcs,
+ unavail_pmds);
+
+ PFM_DBG("regs_all.pmcs=0x%llx",
+ (unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]);
+
+			/* available PMU resources */
+ PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters",
+ pfm_pmu_conf->regs_all.num_pmcs,
+ pfm_pmu_conf->regs_all.num_pmds,
+ pfm_pmu_conf->regs_all.num_counters);
+ }
+ }
+ spin_unlock(&pfm_pmu_acq_lock);
+ /*
+ * copy global regdesc to context (for future extensions)
+ */
+ ctx->regs = pfm_pmu_conf->regs_all;
+
+ return ret;
+}
+
+/*
+ * release the PMU resource
+ *
+ * actual release happens when last context is destroyed
+ *
+ * interrupts are not masked
+ */
+void pfm_pmu_release(void)
+{
+ BUG_ON(irqs_disabled());
+
+ /*
+ * we need to use a spinlock because release takes some time
+ * and we may have a race with pfm_pmu_acquire()
+ */
+ spin_lock(&pfm_pmu_acq_lock);
+
+ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+ /*
+	 * we decouple the test and the decrement because if pfm_pmu_acquire()
+	 * failed, we still come here from pfm_context_free(), but with
+	 * pfm_pmu_acquired == 0
+ */
+ if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) {
+ pfm_arch_pmu_release();
+ PFM_DBG("PMU released");
+ }
+ spin_unlock(&pfm_pmu_acq_lock);
+}
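+
+/*
+ * Pairing sketch (illustrative): pfm_pmu_acquire()/pfm_pmu_release() are
+ * reference counted, so callers on the context create/destroy paths are
+ * expected to balance them, roughly:
+ *
+ *	create:  if (pfm_pmu_acquire(ctx)) -> fail context creation
+ *	destroy: pfm_pmu_release();
+ *
+ * The underlying pfm_arch_pmu_acquire()/pfm_arch_pmu_release() calls only
+ * happen on the 0 -> 1 and 1 -> 0 transitions of pfm_pmu_acquired.
+ */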
diff --git a/perfmon/perfmon_priv.h b/perfmon/perfmon_priv.h
new file mode 100644
index 000000000000..f1068e5ff308
--- /dev/null
+++ b/perfmon/perfmon_priv.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __PERFMON_PRIV_H__
+#define __PERFMON_PRIV_H__
+/*
+ * This file contains all the definitions of data structures, variables, macros
+ * that are private to the generic code, i.e., not shared with any code that
+ * lives under arch/ or include/asm-XX
+ *
+ * For shared definitions, use include/linux/perfmon_kern.h
+ */
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * context lazy save/restore activation count
+ */
+#define PFM_INVALID_ACTIVATION ((u64)~0)
+
+DECLARE_PER_CPU(u64, pmu_activation_number);
+
+static inline void pfm_set_pmu_owner(struct task_struct *task,
+ struct pfm_context *ctx)
+{
+ __get_cpu_var(pmu_owner) = task;
+ __get_cpu_var(pmu_ctx) = ctx;
+}
+
+int pfm_init_ctx(void);
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req,
+ int count);
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req,
+ int count);
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count);
+
+int pfm_session_acquire(void);
+void pfm_session_release(void);
+
+int pfm_init_sysfs(void);
+
+int __pfm_create_context(__u32 ctx_flags, struct pfarg_sinfo *sif,
+ struct pfm_context **new_ctx);
+void pfm_free_context(struct pfm_context *ctx);
+void pfm_undo_create(int fd, struct pfm_context *ctx);
+
+int __pfm_stop(struct pfm_context *ctx);
+int __pfm_start(struct pfm_context *ctx);
+
+int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task);
+int __pfm_unload_context(struct pfm_context *ctx);
+
+int pfm_alloc_fd(struct file **cfile);
+
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what);
+
+int pfm_pmu_acquire(struct pfm_context *ctx);
+void pfm_pmu_release(void);
+
+void pfm_save_pmds(struct pfm_context *ctx);
+
+/*
+ * check_mask bitmask values for pfm_check_task_state()
+ */
+#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */
+#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */
+#define PFM_CMD_UNLOAD 0x04 /* command is unload */
+
+/**
+ * pfm_check_save_prev_ctx - check if a previous context exists and save its state
+ *
+ * called from pfm_load_ctx_thread() and __pfm_ctxsin_thread() to
+ * check if a previous context exists. If so, save its PMU state. This is
+ * used only for UP kernels.
+ *
+ * PMU ownership is not cleared because the function is always called while
+ * trying to install a new owner.
+ */
+static inline void pfm_check_save_prev_ctx(void)
+{
+#ifdef CONFIG_SMP
+ struct pfm_context *ctxp;
+
+ ctxp = __get_cpu_var(pmu_ctx);
+ if (!ctxp)
+ return;
+ /*
+ * in UP per-thread, due to lazy save
+ * there could be a context from another
+ * task. We need to push it first before
+ * installing our new state
+ */
+ pfm_save_pmds(ctxp);
+ /*
+ * do not clear ownership because we rewrite
+ * right away
+ */
+#endif
+}
+
+int pfm_init_fs(void);
+
+static inline void pfm_post_work(struct task_struct *task,
+ struct pfm_context *ctx, int type)
+{
+ ctx->flags.work_type = type;
+ set_tsk_thread_flag(task, TIF_PERFMON_WORK);
+}
+
+#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG
+#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG
+
+#endif /* CONFIG_PERFMON */
+
+#endif /* __PERFMON_PRIV_H__ */
diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c
new file mode 100644
index 000000000000..0af9dfa98b22
--- /dev/null
+++ b/perfmon/perfmon_res.c
@@ -0,0 +1,223 @@
+/*
+ * perfmon_res.c: perfmon2 resource allocations
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * global information about all sessions
+ */
+struct pfm_resources {
+ cpumask_t sys_cpumask; /* bitmask of used cpus */
+ u32 thread_sessions; /* #num loaded per-thread sessions */
+};
+
+static struct pfm_resources pfm_res;
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock);
+
+/**
+ * pfm_session_acquire - reserve a per-thread session
+ *
+ * return:
+ * 0 : success
+ * -EBUSY: if a conflicting session exists
+ */
+int pfm_session_acquire(void)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+	 * validity checks on cpu_mask have been done upstream
+ */
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ PFM_DBG("in thread=%u",
+ pfm_res.thread_sessions);
+
+ pfm_res.thread_sessions++;
+
+ PFM_DBG("out thread=%u ret=%d",
+ pfm_res.thread_sessions,
+ ret);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+ return ret;
+}
+
+/**
+ * pfm_session_release - release a per-thread session
+ *
+ * called from __pfm_unload_context()
+ */
+void pfm_session_release(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ PFM_DBG("in thread=%u",
+ pfm_res.thread_sessions);
+
+ pfm_res.thread_sessions--;
+
+ PFM_DBG("out thread=%u",
+ pfm_res.thread_sessions);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+
+/**
+ * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus
+ *
+ * currently used by Oprofile on X86
+ */
+int pfm_session_allcpus_acquire(void)
+{
+ unsigned long flags;
+ u32 nsys_cpus, cpu;
+ int ret = -EBUSY;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+ PFM_DBG("in sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ if (nsys_cpus) {
+ PFM_DBG("already some system-wide sessions");
+ goto abort;
+ }
+
+ /*
+ * cannot mix system wide and per-task sessions
+ */
+ if (pfm_res.thread_sessions) {
+ PFM_DBG("%u conflicting thread_sessions",
+ pfm_res.thread_sessions);
+ goto abort;
+ }
+
+ for_each_online_cpu(cpu) {
+ cpu_set(cpu, pfm_res.sys_cpumask);
+ nsys_cpus++;
+ }
+
+ PFM_DBG("out sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ ret = 0;
+abort:
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(pfm_session_allcpus_acquire);
+
+/**
+ * pfm_session_allcpus_release - release per-cpu sessions on all cpus
+ *
+ * currently used by Oprofile code
+ */
+void pfm_session_allcpus_release(void)
+{
+ unsigned long flags;
+ u32 nsys_cpus, cpu;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+ PFM_DBG("in sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ /*
+ * XXX: could use __cpus_clear() with nbits
+ */
+ for_each_online_cpu(cpu) {
+ cpu_clear(cpu, pfm_res.sys_cpumask);
+ nsys_cpus--;
+ }
+
+ PFM_DBG("out sys=%u task=%u",
+ nsys_cpus,
+ pfm_res.thread_sessions);
+
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+EXPORT_SYMBOL(pfm_session_allcpus_release);
+
+/**
+ * pfm_sysfs_res_show - return current resource usage for sysfs
+ * @buf: buffer to hold string in return
+ * @sz: size of buf
+ * @what: what to produce
+ * what=0 : thread_sessions
+ * what=1 : cpus_weight(sys_cpumask)
+ * what=2 : smpl_buf_mem_cur
+ * what=3 : pmu model name
+ *
+ * called from perfmon_sysfs.c
+ * return number of bytes written into buf (up to sz)
+ */
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pfm_res_lock, flags);
+
+ switch (what) {
+ case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions);
+ break;
+ case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask));
+ break;
+ case 3:
+ snprintf(buf, sz, "%s\n",
+ pfm_pmu_conf ? pfm_pmu_conf->pmu_name
+				      : "unknown");
+ }
+ spin_unlock_irqrestore(&pfm_res_lock, flags);
+ return strlen(buf);
+}
diff --git a/perfmon/perfmon_rw.c b/perfmon/perfmon_rw.c
new file mode 100644
index 000000000000..bea77d455794
--- /dev/null
+++ b/perfmon/perfmon_rw.c
@@ -0,0 +1,449 @@
+/*
+ * perfmon.c: perfmon2 PMC/PMD read/write system calls
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * is_invalid -- check if register index is within limits
+ * @cnum: register index
+ * @impl: bitmask of implemented registers
+ * @max: highest implemented register index + 1
+ *
+ * return:
+ * 0 if the register index is valid
+ * 1 if invalid
+ */
+static inline int is_invalid(u16 cnum, u64 *impl, u16 max)
+{
+ return cnum >= max || !pfm_arch_bv_test_bit(cnum, impl);
+}
+
+/**
+ * update_used_reg -- update used_pmcs for a single PMD
+ * @ctx: context to use
+ * @set: set to update
+ * @cnum: new PMD to add
+ *
+ * This function adds to used_pmcs the PMCs that PMD cnum depends on
+ */
+static inline void update_used_reg(struct pfm_context *ctx,
+ struct pfm_event_set *set, u16 cnum)
+{
+ pfm_arch_bv_or(set->used_pmcs,
+ set->used_pmcs,
+ pfm_pmu_conf->pmd_desc[cnum].dep_pmcs,
+ ctx->regs.max_pmc);
+}
+
+/**
+ * update_changes -- update nused_pmcs, nused_pmds, write newly touched pmcs
+ * @ctx: context to use
+ * @set: event set to use
+ * @old_used_pmcs: former used_pmc bitmask
+ *
+ * This function updates nused_pmcs and nused_pmds after the last modification
+ * to an event set. When new pmcs are used, then they must be initialized such
+ * that we do not pick up stale values from another session.
+ */
+static inline int update_changes(struct pfm_context *ctx, struct pfm_event_set *set,
+ u64 *old_used_pmcs)
+{
+ struct pfarg_pmr req;
+ u16 max_pmc, max_pmd;
+ int n, p, q, ret = 0;
+
+ max_pmd = ctx->regs.max_pmd;
+ max_pmc = ctx->regs.max_pmc;
+
+ /*
+ * update used counts
+ */
+ set->nused_pmds = pfm_arch_bv_weight(set->used_pmds, max_pmd);
+ set->nused_pmcs = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+
+ PFM_DBG("u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u",
+ (unsigned long long)set->used_pmds[0],
+ set->nused_pmds,
+ (unsigned long long)set->used_pmcs[0],
+ set->nused_pmcs);
+
+ memset(&req, 0, sizeof(req));
+
+ n = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+	for (p = 0; n; n--, p = q + 1) {
+ q = pfm_arch_bv_find_next_bit(set->used_pmcs, max_pmc, p);
+
+ if (pfm_arch_bv_test_bit(q, old_used_pmcs))
+ continue;
+
+ req.reg_num = q;
+ req.reg_value = set->pmcs[q];
+
+ ret = __pfm_write_pmcs(ctx, &req, 1);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+/**
+ * __pfm_write_pmds - modify data registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ struct pfm_event_set *set;
+ u64 old_used_pmcs[PFM_PMC_BV];
+ u64 value, ovfl_mask;
+ u64 *impl_pmds;
+ u16 cnum, pmd_type, max_pmd;
+ int i, can_access_pmu;
+ int ret;
+ pfm_pmd_check_t wr_func;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ max_pmd = ctx->regs.max_pmd;
+ impl_pmds = ctx->regs.pmds;
+ wr_func = pfm_pmu_conf->pmd_write_check;
+
+ can_access_pmu = 0;
+
+ /*
+ * we cannot access the actual PMD registers when monitoring is masked
+ */
+ if (unlikely(ctx->state == PFM_CTX_LOADED))
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ pfm_arch_bv_copy(old_used_pmcs, set->used_pmcs,
+ ctx->regs.max_pmc);
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+
+ /*
+		 * cannot write to a non-existing register,
+		 * writes to read-only registers are ignored
+ */
+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+ PFM_DBG("pmd%u is not available", cnum);
+ goto error;
+ }
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+ /*
+ * execute write checker, if any
+ */
+ if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) {
+ ret = (*wr_func)(ctx, set, req);
+ if (ret)
+ goto error;
+
+ }
+
+ value = req->reg_value;
+
+ /*
+		 * we reprogram the PMD, hence we clear any pending
+		 * overflow. This does affect overflow switching on restart,
+		 * but the new value has already been established here
+ */
+ if (pfm_arch_bv_test_bit(cnum, set->povfl_pmds)) {
+ set->npend_ovfls--;
+ pfm_arch_bv_clear_bit(cnum, set->povfl_pmds);
+ }
+
+ /*
+ * update value
+ */
+ set->pmds[cnum] = value;
+
+ pfm_arch_bv_set_bit(cnum, set->used_pmds);
+ update_used_reg(ctx, set, cnum);
+
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS;
+ if (can_access_pmu)
+ pfm_write_pmd(ctx, cnum, value);
+
+ /*
+ * update number of used PMD registers
+ */
+ set->nused_pmds = pfm_arch_bv_weight(set->used_pmds,
+ max_pmd);
+
+ PFM_DBG("pmd%u=0x%llx a_pmu=%d "
+ "ctx_pmd=0x%llx "
+ " u_pmds=0x%llx nu_pmds=%u ",
+ cnum,
+ (unsigned long long)value,
+ can_access_pmu,
+ (unsigned long long)set->pmds[cnum],
+ (unsigned long long)set->used_pmds[0],
+ set->nused_pmds);
+ }
+ ret = 0;
+error:
+ update_changes(ctx, set, old_used_pmcs);
+ /*
+ * make changes visible
+ */
+ if (can_access_pmu)
+ pfm_arch_serialize();
+
+ return ret;
+}
+
+/**
+ * __pfm_write_pmcs - modify config registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ struct pfm_event_set *set;
+ u64 value, dfl_val, rsvd_msk;
+ u64 *impl_pmcs;
+ int i, can_access_pmu;
+ int ret;
+ u16 cnum, pmc_type, max_pmc;
+ pfm_pmc_check_t wr_func;
+
+ wr_func = pfm_pmu_conf->pmc_write_check;
+ max_pmc = ctx->regs.max_pmc;
+ impl_pmcs = ctx->regs.pmcs;
+
+ can_access_pmu = 0;
+
+ /*
+ * we cannot access the actual PMC registers when monitoring is masked
+ */
+ if (unlikely(ctx->state == PFM_CTX_LOADED))
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+ value = req->reg_value;
+
+ /*
+ * no access to unavailable PMC register
+ */
+ if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) {
+ PFM_DBG("pmc%u is not available", cnum);
+ goto error;
+ }
+
+ pmc_type = pfm_pmu_conf->pmc_desc[cnum].type;
+ dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
+ rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk;
+
+ /*
+ * set reserved bits to default values
+ * (reserved bits must be 1 in rsvd_msk)
+ */
+ value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk);
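+
+		/*
+		 * Worked example (illustrative): with rsvd_msk = 0xf0,
+		 * dfl_val = 0x30 and a user value of 0xff, the result is
+		 * (0xff & ~0xf0) | (0x30 & 0xf0) = 0x0f | 0x30 = 0x3f,
+		 * i.e., user bits are kept only outside the reserved field,
+		 * which is forced to its default value.
+		 */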
+
+ /*
+ * execute write checker, if any
+ */
+ if (likely(wr_func && (pmc_type & PFM_REG_WC))) {
+ req->reg_value = value;
+ ret = (*wr_func)(ctx, set, req);
+ if (ret)
+ goto error;
+ value = req->reg_value;
+ }
+
+ /*
+ * Now we commit the changes
+ */
+
+ /*
+ * mark PMC register as used
+ * We do not track associated PMC register based on
+ * the fact that they will likely need to be written
+ * in order to become useful at which point the statement
+	 * in order to become useful, at which point the statement
+ *
+ * The used_pmcs bitmask is only useful on architectures where
+ * the PMC needs to be modified for particular bits, especially
+ * on overflow or to stop/start.
+ */
+ if (!pfm_arch_bv_test_bit(cnum, set->used_pmcs)) {
+ pfm_arch_bv_set_bit(cnum, set->used_pmcs);
+ set->nused_pmcs++;
+ }
+
+ set->pmcs[cnum] = value;
+
+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+ if (can_access_pmu)
+ pfm_arch_write_pmc(ctx, cnum, value);
+
+ PFM_DBG("pmc%u=0x%llx a_pmu=%d "
+ "u_pmcs=0x%llx nu_pmcs=%u",
+ cnum,
+ (unsigned long long)value,
+ can_access_pmu,
+ (unsigned long long)set->used_pmcs[0],
+ set->nused_pmcs);
+ }
+ ret = 0;
+error:
+ /*
+ * make sure the changes are visible
+ */
+ if (can_access_pmu)
+ pfm_arch_serialize();
+
+ return ret;
+}
+
+/**
+ * __pfm_read_pmds - read data registers
+ * @ctx: context to operate on
+ * @req: pfarg_pmr request from user
+ * @count: number of elements in the pfarg_pmr vector
+ *
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+ u64 val = 0, ovfl_mask, hw_val;
+ u64 *impl_pmds;
+ struct pfm_event_set *set;
+ int i, ret, can_access_pmu = 0;
+ u16 cnum, pmd_type, max_pmd;
+
+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
+ impl_pmds = ctx->regs.pmds;
+ max_pmd = ctx->regs.max_pmd;
+
+ if (likely(ctx->state == PFM_CTX_LOADED)) {
+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+ if (can_access_pmu)
+ pfm_arch_serialize();
+ }
+
+ /*
+ * on both UP and SMP, we can only read the PMD from the hardware
+ * register when the task is the owner of the local PMU.
+ */
+ ret = -EINVAL;
+ set = ctx->active_set;
+
+ for (i = 0; i < count; i++, req++) {
+
+ cnum = req->reg_num;
+
+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+			PFM_DBG("pmd%u is not implemented or inaccessible", cnum);
+ goto error;
+ }
+
+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+ /*
+ * it is not possible to read a PMD which was not requested:
+ * - explicitly written via pfm_write_pmds()
+ * - provided as a reg_smpl_pmds[] to another PMD during
+ * pfm_write_pmds()
+ *
+ * This is motivated by security and for optimization purposes:
+ * - on context switch restore, we can restore only what
+ * we use (except when regs directly readable at user
+ * level, e.g., IA-64 self-monitoring, I386 RDPMC).
+ * - do not need to maintain PMC -> PMD dependencies
+ */
+ if (unlikely(!pfm_arch_bv_test_bit(cnum, set->used_pmds))) {
+			PFM_DBG("pmd%u cannot be read because it is not used", cnum);
+ goto error;
+ }
+
+ val = set->pmds[cnum];
+
+ /*
+ * If the task is not the current one, then we check if the
+ * PMU state is still in the local live register due to lazy
+ * ctxsw. If true, then we read directly from the registers.
+ */
+ if (can_access_pmu) {
+ hw_val = pfm_read_pmd(ctx, cnum);
+ if (pmd_type & PFM_REG_C64)
+ val = (val & ~ovfl_mask)
+ | (hw_val & ovfl_mask);
+ else
+ val = hw_val;
+ }
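+
+		/*
+		 * Illustrative example: with counter_width = 32,
+		 * ovfl_mask = 0xffffffff. If the software copy holds
+		 * 0x300000000 (upper bits accumulated from past overflows)
+		 * and the hardware reads 0x1234, the merged 64-bit value
+		 * is 0x300000000 | 0x1234 = 0x300001234.
+		 */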
+
+ PFM_DBG("pmd%u=0x%llx ",
+ cnum,
+ (unsigned long long)val);
+
+ req->reg_value = val;
+ }
+ ret = 0;
+error:
+ return ret;
+}
diff --git a/perfmon/perfmon_syscalls.c b/perfmon/perfmon_syscalls.c
new file mode 100644
index 000000000000..5c900bb05ad9
--- /dev/null
+++ b/perfmon/perfmon_syscalls.c
@@ -0,0 +1,741 @@
+/*
+ * perfmon_syscalls.c: perfmon2 system call interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/ptrace.h>
+#include <linux/perfmon_kern.h>
+#include <linux/uaccess.h>
+#include "perfmon_priv.h"
+
+/*
+ * Context locking rules:
+ * ---------------------
+ * - any thread with access to the file descriptor of a context can
+ * potentially issue perfmon calls
+ *
+ * - calls must be serialized to guarantee correctness
+ *
+ * - as soon as a context is attached to a thread or CPU, it may be
+ * actively monitoring. On some architectures, such as IA-64, this
+ * is true even though the pfm_start() call has not been made. This
+ * comes from the fact that on some architectures, it is possible to
+ * start/stop monitoring from userland.
+ *
+ * - If monitoring is active, then there can be PMU interrupts. Because
+ * context accesses must be serialized, the perfmon system calls
+ * must mask interrupts as soon as the context is attached.
+ *
+ * - perfmon system calls that operate with the context unloaded cannot
+ * assume it is actually unloaded when they are called. They first need
+ * to check and for that they need interrupts masked. Then, if the
+ * context is actually unloaded, they can unmask interrupts.
+ *
+ * - interrupt masking holds true for other internal perfmon functions as
+ *   well, except for the PMU interrupt handler, because those interrupts
+ *   cannot be nested.
+ *
+ * - we mask ALL interrupts instead of just the PMU interrupt because we
+ * also need to protect against timer interrupts which could trigger
+ * a set switch.
+ */
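+
+/*
+ * Canonical call pattern (sketch, for illustration): every syscall in this
+ * file follows roughly the same sequence, which is what the rules above
+ * describe:
+ *
+ *	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ *	spin_lock_irqsave(&ctx->lock, flags);
+ *	ret = pfm_check_task_state(ctx, check_mask, &flags);
+ *	if (!ret)
+ *		ret = __pfm_operation(ctx, ...);
+ *	spin_unlock_irqrestore(&ctx->lock, flags);
+ *	pfm_release_ctx_from_fd(&cookie);
+ *
+ * __pfm_operation() stands for any of the __pfm_* helpers; it is not a
+ * real function.
+ */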
+
+struct pfm_syscall_cookie {
+ struct file *filp;
+ int fput_needed;
+};
+
+/*
+ * cannot attach if :
+ * - kernel task
+ * - task not owned by caller (checked by ptrace_may_access())
+ * - task is dead or zombie
+ * - cannot use blocking notification when self-monitoring
+ */
+static int pfm_task_incompatible(struct pfm_context *ctx,
+ struct task_struct *task)
+{
+ /*
+ * cannot attach to a kernel thread
+ */
+ if (!task->mm) {
+ PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
+ return -EPERM;
+ }
+
+ /*
+ * cannot attach to a zombie task
+ */
+ if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) {
+ PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * pfm_get_task -- check permission and acquire task to monitor
+ * @ctx: perfmon context
+ * @pid: identification of the task to check
+ * @task: upon return, a pointer to the task to monitor
+ *
+ * This function is used in per-thread mode only AND when not
+ * self-monitoring. It finds the task to monitor and checks
+ * that the caller has permissions to attach. It also checks
+ * that the task is stopped via ptrace so that we can safely
+ * modify its state.
+ *
+ * the task refcount is incremented on success.
+ */
+static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
+ struct task_struct **task)
+{
+ struct task_struct *p;
+ int ret = 0, ret1 = 0;
+
+ /*
+ * When attaching to another thread we must ensure
+ * that the thread is actually stopped. Just like with
+ * perfmon system calls, we enforce that the thread
+ * be ptraced and STOPPED by using ptrace_check_attach().
+ *
+ * As a consequence, only the ptracing parent can actually
+ * attach a context to a thread. Obviously, this constraint
+ * does not exist for self-monitoring threads.
+ *
+ * We use ptrace_may_access() to check for permission.
+ */
+ read_lock(&tasklist_lock);
+
+ p = find_task_by_vpid(pid);
+ if (p)
+ get_task_struct(p);
+
+ read_unlock(&tasklist_lock);
+
+ if (!p) {
+ PFM_DBG("task not found %d", pid);
+ return -ESRCH;
+ }
+
+ ret = -EPERM;
+
+ /*
+ * returns 0 if cannot attach
+ */
+ ret1 = ptrace_may_access(p, PTRACE_MODE_ATTACH);
+ if (ret1)
+ ret = ptrace_check_attach(p, 0);
+
+ PFM_DBG("may_attach=%d check_attach=%d", ret1, ret);
+
+ if (ret || !ret1)
+ goto error;
+
+ ret = pfm_task_incompatible(ctx, p);
+ if (ret)
+ goto error;
+
+ *task = p;
+
+ return 0;
+error:
+ if (!(ret1 || ret))
+ ret = -EPERM;
+
+ put_task_struct(p);
+
+ return ret;
+}
+
+/*
+ * context must be locked when calling this function
+ */
+int __pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+ unsigned long *flags)
+{
+ struct task_struct *task;
+ unsigned long local_flags, new_flags;
+ int state, ret;
+
+recheck:
+ /*
+ * task is NULL for system-wide context
+ */
+ task = ctx->task;
+ state = ctx->state;
+ local_flags = *flags;
+
+ PFM_DBG("state=%d check_mask=0x%x task=[%d]",
+ state, check_mask, task ? task->pid:-1);
+ /*
+ * if the context is detached, then we do not touch
+	 * hardware, therefore there is no restriction on when we can
+ * access it.
+ */
+ if (state == PFM_CTX_UNLOADED)
+ return 0;
+ /*
+ * no command can operate on a zombie context.
+ * A context becomes zombie when the file that identifies
+ * it is closed while the context is still attached to the
+ * thread it monitors.
+ */
+ if (state == PFM_CTX_ZOMBIE)
+ return -EINVAL;
+
+ /*
+ * at this point, state is PFM_CTX_LOADED
+ */
+
+ /*
+ * some commands require the context to be unloaded to operate
+ */
+ if (check_mask & PFM_CMD_UNLOADED) {
+ PFM_DBG("state=%d, cmd needs context unloaded", state);
+ return -EBUSY;
+ }
+
+ /*
+ * self-monitoring always ok.
+ */
+ if (task == current)
+ return 0;
+
+ /*
+ * at this point, monitoring another thread
+ */
+
+ /*
+ * When we operate on another thread, we must wait for it to be
+ * stopped and completely off any CPU as we need to access the
+ * PMU state (or machine state).
+ *
+ * A thread can be put in the STOPPED state in various ways
+ * including PTRACE_ATTACH, or when it receives a SIGSTOP signal.
+ * We enforce that the thread must be ptraced, so it is stopped
+ * AND it CANNOT wake up while we operate on it because this
+ * would require an action from the ptracing parent which is the
+ * thread that is calling this function.
+ *
+	 * The dependency on ptrace imposes that only the ptracing
+	 * parent can issue commands on a thread. This is unfortunate
+ * but we do not know of a better way of doing this.
+ */
+ if (check_mask & PFM_CMD_STOPPED) {
+
+ spin_unlock_irqrestore(&ctx->lock, local_flags);
+
+ /*
+ * check that the thread is ptraced AND STOPPED
+ */
+ ret = ptrace_check_attach(task, 0);
+
+ spin_lock_irqsave(&ctx->lock, new_flags);
+
+ /*
+ * flags may be different than when we released the lock
+ */
+ *flags = new_flags;
+
+ if (ret)
+ return ret;
+ /*
+ * we must recheck to verify if state has changed
+ */
+ if (unlikely(ctx->state != state)) {
+ PFM_DBG("old_state=%d new_state=%d",
+ state,
+ ctx->state);
+ goto recheck;
+ }
+ }
+ return 0;
+}
+
+int pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+ unsigned long *flags)
+{
+ int ret;
+ ret = __pfm_check_task_state(ctx, check_mask, flags);
+	PFM_DBG("ret=%d", ret);
+ return ret;
+}
+
+/**
+ * pfm_get_args - Function used to copy the syscall argument into kernel memory
+ * @ureq: user argument
+ * @sz: user argument size
+ * @lsz: size of stack buffer
+ * @laddr: stack buffer address
+ * @req: points to the start of the kernel copy of the argument
+ * @ptr_free: address of kernel copy to free
+ *
+ * There are two options:
+ * - use a stack buffer described by laddr (addresses) and lsz (size)
+ * - allocate memory
+ *
+ * return:
+ * < 0 : in case of error (ptr_free may not be updated)
+ * 0 : success
+ * - req: points to base of kernel copy of arguments
+ * - ptr_free: address of buffer to free by caller on exit.
+ * NULL if using the stack buffer
+ *
+ * when ptr_free is not NULL upon return, the caller must kfree()
+ */
+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
+ void **req, void **ptr_free)
+{
+ void *addr;
+
+ /*
+	 * check sysadmin argument limit
+ */
+ if (unlikely(sz > pfm_controls.arg_mem_max)) {
+ PFM_DBG("argument too big %zu max=%zu",
+ sz,
+ pfm_controls.arg_mem_max);
+ return -E2BIG;
+ }
+
+ /*
+ * check if vector fits on stack buffer
+ */
+ if (sz > lsz) {
+ addr = kmalloc(sz, GFP_KERNEL);
+ if (unlikely(addr == NULL))
+ return -ENOMEM;
+ *ptr_free = addr;
+ } else {
+ addr = laddr;
+ *req = laddr;
+ *ptr_free = NULL;
+ }
+
+ /*
+ * bring the data in
+ */
+ if (unlikely(copy_from_user(addr, ureq, sz))) {
+ if (addr != laddr)
+ kfree(addr);
+ return -EFAULT;
+ }
+
+ /*
+ * base address of kernel buffer
+ */
+ *req = addr;
+
+ return 0;
+}
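+
+/*
+ * Usage sketch (illustrative): callers reserve a small on-stack buffer and
+ * let pfm_get_args() decide whether it is big enough, e.g.:
+ *
+ *	u64 buf[PFM_STK_ARG];
+ *	void *req, *fptr;
+ *
+ *	ret = pfm_get_args(ureq, sz, sizeof(buf), buf, &req, &fptr);
+ *	...
+ *	kfree(fptr);	(kfree(NULL) is a no-op when the stack buffer was used)
+ *
+ * This mirrors what sys_pfm_write() and sys_pfm_read() do below.
+ */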
+
+/**
+ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor
+ * @fd: file descriptor
+ * @ctx: pointer to pointer of context updated on return
+ * @cookie: opaque structure to use for release
+ *
+ * This helper function extracts the ctx from the file descriptor.
+ * It also increments the refcount of the file structure. Thus
+ * it updates the cookie so the refcount can be decreased when
+ * leaving the perfmon syscall via pfm_release_ctx_from_fd().
+ */
+static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx,
+ struct pfm_syscall_cookie *cookie)
+{
+ struct file *filp;
+ int fput_needed;
+
+ filp = fget_light(fd, &fput_needed);
+ if (unlikely(filp == NULL)) {
+ PFM_DBG("invalid fd %d", fd);
+ return -EBADF;
+ }
+
+ *ctx = filp->private_data;
+
+	if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) {
+		PFM_DBG("fd %d not related to perfmon", fd);
+		fput_light(filp, fput_needed);
+		return -EBADF;
+	}
+ cookie->filp = filp;
+ cookie->fput_needed = fput_needed;
+
+ return 0;
+}
+
+/**
+ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context
+ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd
+ */
+static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie)
+{
+ fput_light(cookie->filp, cookie->fput_needed);
+}
+
+/**
+ * pfm_validate_type_sz -- validate sz based on type
+ * @type : PFM_RW_XX type passed to pfm_write or pfm_read
+ * @sz : vector size in bytes
+ *
+ * return:
+ * the number of elements in the vector, 0 if error
+ */
+static size_t pfm_validate_type_sz(int type, size_t sz)
+{
+ size_t count, sz_type;
+
+	switch (type) {
+ case PFM_RW_PMD:
+ case PFM_RW_PMC:
+ sz_type = sizeof(struct pfarg_pmr);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ return 0;
+ }
+
+ count = sz / sz_type;
+
+ if ((count * sz_type) != sz) {
+ PFM_DBG("invalid size=%zu for type=%d", sz, type);
+ return 0;
+ }
+
+ PFM_DBG("sz=%zu sz_type=%zu count=%zu",
+ sz,
+ sz_type,
+ count);
+
+ return count;
+}
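+
+/*
+ * Example (illustrative): for type PFM_RW_PMD and sz = 3 *
+ * sizeof(struct pfarg_pmr), the function returns 3. If sz is not an exact
+ * multiple of sizeof(struct pfarg_pmr), it returns 0 and the caller
+ * rejects the request with -EINVAL.
+ */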
+
+/*
+ * unlike the other perfmon system calls, this one returns a file descriptor
+ * or a value < 0 in case of error, very much like open() or socket()
+ */
+asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo __user *ureq)
+{
+ struct pfm_context *new_ctx;
+ struct pfarg_sinfo sif;
+ int ret;
+
+ PFM_DBG("flags=0x%x sif=%p", flags, ureq);
+
+ if (perfmon_disabled)
+ return -ENOSYS;
+
+ if (flags) {
+ PFM_DBG("no flags accepted yet");
+ return -EINVAL;
+ }
+	ret = __pfm_create_context(flags, &sif, &new_ctx);
+	if (ret < 0)
+		return ret;
+
+ /*
+ * copy sif to user level argument, if requested
+ */
+ if (ureq && copy_to_user(ureq, &sif, sizeof(sif))) {
+ pfm_undo_create(ret, new_ctx);
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+asmlinkage long sys_pfm_write(int fd, int uflags,
+ int type,
+ void __user *ureq,
+ size_t sz)
+{
+ u64 buf[PFM_STK_ARG];
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ void *req, *fptr;
+ unsigned long flags;
+ size_t count;
+ int ret;
+
+ PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+ fd, uflags, type, ureq, sz);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+ count = pfm_validate_type_sz(type, sz);
+ if (!count)
+ return -EINVAL;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+ if (ret)
+ goto error;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (ret)
+ goto skip;
+	switch (type) {
+ case PFM_RW_PMC:
+ ret = __pfm_write_pmcs(ctx, req, count);
+ break;
+ case PFM_RW_PMD:
+ ret = __pfm_write_pmds(ctx, req, count);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ ret = -EINVAL;
+ }
+skip:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+ * This function may be on the critical path.
+	 * We want to avoid the branch if unnecessary.
+ */
+ if (fptr)
+ kfree(fptr);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
+
+asmlinkage long sys_pfm_read(int fd, int uflags,
+ int type,
+ void __user *ureq,
+ size_t sz)
+{
+ u64 buf[PFM_STK_ARG];
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ void *req, *fptr;
+ unsigned long flags;
+ size_t count;
+ int ret;
+
+ PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+ fd, uflags, type, ureq, sz);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+ count = pfm_validate_type_sz(type, sz);
+ if (!count)
+ return -EINVAL;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+ if (ret)
+ goto error;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (ret)
+ goto skip;
+
+	switch (type) {
+ case PFM_RW_PMD:
+ ret = __pfm_read_pmds(ctx, req, count);
+ break;
+ default:
+ PFM_DBG("invalid type=%d", type);
+ ret = -EINVAL;
+ }
+skip:
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (copy_to_user(ureq, req, sz))
+ ret = -EFAULT;
+
+ if (fptr)
+ kfree(fptr);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
+
+asmlinkage long sys_pfm_set_state(int fd, int uflags, int state)
+{
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ PFM_DBG("fd=%d uflags=0x%x state=0x%x", fd, uflags, state);
+
+ if (uflags) {
+ PFM_DBG("no flags defined");
+ return -EINVAL;
+ }
+
+	switch (state) {
+ case PFM_ST_START:
+ case PFM_ST_STOP:
+ break;
+ default:
+ PFM_DBG("invalid state=0x%x", state);
+ return -EINVAL;
+ }
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+ if (!ret) {
+ if (state == PFM_ST_STOP)
+ ret = __pfm_stop(ctx);
+ else
+ ret = __pfm_start(ctx);
+ }
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ pfm_release_ctx_from_fd(&cookie);
+
+ return ret;
+}
+
+static long pfm_detach(int fd, int uflags)
+{
+ struct pfm_context *ctx;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, &flags);
+ if (!ret)
+ ret = __pfm_unload_context(ctx);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+	 * if the unload was successful, then release the session.
+	 * This must be called with interrupts enabled, thus we need
+	 * to defer it until we are out of __pfm_unload_context()
+ */
+ if (!ret)
+ pfm_session_release();
+
+ pfm_release_ctx_from_fd(&cookie);
+
+ return ret;
+}
+
+asmlinkage long sys_pfm_attach(int fd, int uflags, int target)
+{
+ struct pfm_context *ctx;
+ struct task_struct *task;
+ struct pfm_syscall_cookie cookie;
+ unsigned long flags;
+ int ret;
+
+ PFM_DBG("fd=%d uflags=0x%x target=%d", fd, uflags, target);
+
+ if (uflags) {
+ PFM_DBG("invalid flags");
+ return -EINVAL;
+ }
+
+ /*
+ * handle detach in a separate function
+ */
+ if (target == PFM_NO_TARGET)
+ return pfm_detach(fd, uflags);
+
+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+ if (ret)
+ return ret;
+
+ task = current;
+
+ /*
+ * in per-thread mode (not self-monitoring), get a reference
+	 * on the task to monitor. This must be done with interrupts enabled.
+	 * Upon successful return, the refcount on the task has increased.
+ *
+ * fget_light() is protecting the context.
+ */
+ if (target != current->pid) {
+ ret = pfm_get_task(ctx, target, &task);
+ if (ret)
+ goto error;
+ }
+
+ /*
+ * irqsave is required to avoid race in case context is already
+ * loaded or with switch timeout in the case of self-monitoring
+ */
+ spin_lock_irqsave(&ctx->lock, flags);
+
+ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags);
+ if (!ret)
+ ret = __pfm_load_context(ctx, task);
+
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+ * in per-thread mode (not self-monitoring), we need
+ * to decrease refcount on task to monitor:
+ * - attach successful: we have a reference in ctx->task
+ * - attach failed : undo the effect of pfm_get_task()
+ */
+ if (task != current)
+ put_task_struct(task);
+error:
+ pfm_release_ctx_from_fd(&cookie);
+ return ret;
+}
diff --git a/perfmon/perfmon_sysfs.c b/perfmon/perfmon_sysfs.c
new file mode 100644
index 000000000000..b13c12581175
--- /dev/null
+++ b/perfmon/perfmon_sysfs.c
@@ -0,0 +1,344 @@
+/*
+ * perfmon_sysfs.c: perfmon2 sysfs interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+struct pfm_attribute {
+ struct attribute attr;
+ ssize_t (*show)(void *, struct pfm_attribute *attr, char *);
+ ssize_t (*store)(void *, const char *, size_t);
+};
+#define to_attr(n) container_of(n, struct pfm_attribute, attr)
+
+
+#define PFM_RO_ATTR(_name, _show) \
+ struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define PFM_RW_ATTR(_name, _show, _store) \
+ struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store)
+
+#define PFM_ROS_ATTR(_name, _show) \
+ struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define is_attr_name(a, n) (!strcmp((a)->attr.name, n))
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
+
+static struct kobject *pfm_kernel_kobj;
+static struct kobject *pfm_pmu_kobj;
+
+
+static ssize_t pfm_regs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj)
+ struct pfm_regmap_desc *reg = to_reg(kobj);
+ struct pfm_attribute *attribute = to_attr(attr);
+ return attribute->show ? attribute->show(reg, attribute, buf) : -EIO;
+}
+
+static struct sysfs_ops pfm_regs_sysfs_ops = {
+ .show = pfm_regs_attr_show
+};
+
+static struct kobj_type pfm_regs_ktype = {
+ .sysfs_ops = &pfm_regs_sysfs_ops,
+};
+
+static ssize_t pfm_controls_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+
+ if (is_attr_name(attr, "version"))
+ return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+ if (is_attr_name(attr, "task_sessions_count"))
+ return pfm_sysfs_res_show(buf, PAGE_SIZE, 0);
+
+ if (is_attr_name(attr, "debug"))
+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug);
+
+ if (is_attr_name(attr, "task_group"))
+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group);
+
+ if (is_attr_name(attr, "arg_mem_max"))
+ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max);
+
+ return 0;
+}
+
+static ssize_t pfm_controls_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ size_t d;
+
+ if (sscanf(buf, "%zu", &d) != 1)
+ goto skip;
+
+ if (is_attr_name(attr, "debug"))
+ pfm_controls.debug = d;
+
+ if (is_attr_name(attr, "task_group"))
+ pfm_controls.task_group = d;
+
+ if (is_attr_name(attr, "arg_mem_max")) {
+ /*
+ * we impose a page as the minimum.
+ *
+ * This limit may be smaller than the stack buffer
+ * available and that is fine.
+ */
+ if (d >= PAGE_SIZE)
+ pfm_controls.arg_mem_max = d;
+ }
+
+skip:
+ return count;
+}
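+
+/*
+ * Example (illustrative): writing "1" to /sys/kernel/perfmon/debug sets
+ * pfm_controls.debug to 1. Values written to arg_mem_max are only
+ * accepted when they are at least PAGE_SIZE.
+ */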
+
+/*
+ * /sys/kernel/perfmon attributes
+ */
+static PFM_RO_ATTR(version, pfm_controls_show);
+static PFM_RO_ATTR(task_sessions_count, pfm_controls_show);
+static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store);
+
+static struct attribute *pfm_kernel_attrs[] = {
+ &attr_version.attr,
+ &attr_task_sessions_count.attr,
+ &attr_debug.attr,
+ &attr_task_group.attr,
+ &attr_arg_mem_max.attr,
+ NULL
+};
+
+static struct attribute_group pfm_kernel_attr_group = {
+ .attrs = pfm_kernel_attrs,
+};
+
+/*
+ * per-reg attributes
+ */
+static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf)
+{
+ struct pfm_regmap_desc *reg = data;
+ int w;
+
+ if (is_attr_name(attr, "name"))
+ return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc);
+
+ if (is_attr_name(attr, "dfl_val"))
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long)reg->dfl_val);
+
+ if (is_attr_name(attr, "width")) {
+ w = (reg->type & PFM_REG_C64) ?
+ pfm_pmu_conf->counter_width : 64;
+ return snprintf(buf, PAGE_SIZE, "%d\n", w);
+ }
+
+ if (is_attr_name(attr, "rsvd_msk"))
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long)reg->rsvd_msk);
+
+ if (is_attr_name(attr, "addr"))
+ return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr);
+
+ return 0;
+}
+
+static PFM_ROS_ATTR(name, pfm_reg_show);
+static PFM_ROS_ATTR(dfl_val, pfm_reg_show);
+static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show);
+static PFM_ROS_ATTR(width, pfm_reg_show);
+static PFM_ROS_ATTR(addr, pfm_reg_show);
+
+static struct attribute *pfm_reg_attrs[] = {
+ &attr_name.attr,
+ &attr_dfl_val.attr,
+ &attr_rsvd_msk.attr,
+ &attr_width.attr,
+ &attr_addr.attr,
+ NULL
+};
+
+static struct attribute_group pfm_reg_attr_group = {
+ .attrs = pfm_reg_attrs,
+};
+
+static ssize_t pfm_pmu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ if (is_attr_name(attr, "model"))
+ return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name);
+ return 0;
+}
+
+static PFM_RO_ATTR(model, pfm_pmu_show);
+
+static struct attribute *pfm_pmu_desc_attrs[] = {
+ &attr_model.attr,
+ NULL
+};
+
+static struct attribute_group pfm_pmu_desc_attr_group = {
+ .attrs = pfm_pmu_desc_attrs,
+};
+
+static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu)
+{
+ struct pfm_regmap_desc *reg;
+ unsigned int i, k;
+ int ret;
+
+ reg = pmu->pmc_desc;
+ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) {
+
+ if (!(reg->type & PFM_REG_I))
+ continue;
+
+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+ pfm_pmu_kobj, "pmc%u", i);
+ if (ret)
+ goto undo_pmcs;
+
+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+ if (ret) {
+ kobject_del(&reg->kobj);
+ goto undo_pmcs;
+ }
+ }
+
+ reg = pmu->pmd_desc;
+ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) {
+
+ if (!(reg->type & PFM_REG_I))
+ continue;
+
+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+ pfm_pmu_kobj, "pmd%u", i);
+ if (ret)
+ goto undo_pmds;
+
+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+ if (ret) {
+ kobject_del(&reg->kobj);
+ goto undo_pmds;
+ }
+ }
+ return 0;
+undo_pmds:
+ reg = pmu->pmd_desc;
+ for (k = 0; k < i; k++, reg++) {
+ if (!(reg->type & PFM_REG_I))
+ continue;
+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+ kobject_del(&reg->kobj);
+ }
+ i = pmu->num_pmc_entries;
+ /* fall through */
+undo_pmcs:
+ reg = pmu->pmc_desc;
+ for (k = 0; k < i; k++, reg++) {
+ if (!(reg->type & PFM_REG_I))
+ continue;
+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+ kobject_del(&reg->kobj);
+ }
+ return ret;
+}
+
+/*
+ * when a PMU description module is inserted, we create
+ * a pmu_desc subdir in sysfs and we populate it with
+ * PMU specific information, such as register mappings
+ */
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu)
+{
+ int ret;
+
+ pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj);
+ if (!pfm_pmu_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+ if (ret) {
+ /* will release pfm_pmu_kobj */
+ kobject_put(pfm_pmu_kobj);
+ return ret;
+ }
+
+ ret = pfm_sysfs_add_pmu_regs(pmu);
+ if (ret) {
+ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+ /* will release pfm_pmu_kobj */
+ kobject_put(pfm_pmu_kobj);
+	} else {
+		kobject_uevent(pfm_pmu_kobj, KOBJ_ADD);
+	}
+
+ return ret;
+}
+
+int __init pfm_init_sysfs(void)
+{
+ int ret;
+
+ /*
+ * dynamic allocation happens on pfm_kernel_kobj,
+ * but a release callback is attached
+ */
+ pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj);
+ if (!pfm_kernel_kobj) {
+ PFM_ERR("cannot add kernel object");
+ return -ENOMEM;
+ }
+
+ ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group);
+ if (ret) {
+ kobject_put(pfm_kernel_kobj);
+ return ret;
+ }
+
+ if (pfm_pmu_conf)
+ pfm_sysfs_add_pmu(pfm_pmu_conf);
+
+ return 0;
+}
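+
+/*
+ * Resulting layout (sketch, derived from pfm_init_sysfs() and
+ * pfm_sysfs_add_pmu_regs() above): with a PMU description loaded, the
+ * hierarchy looks roughly like:
+ *
+ *	/sys/kernel/perfmon/			<- pfm_kernel_kobj, control attributes
+ *	/sys/kernel/perfmon/pmu_desc/		<- pfm_pmu_kobj, "model" attribute
+ *	/sys/kernel/perfmon/pmu_desc/pmcN/	<- name, dfl_val, rsvd_msk, width, addr
+ *	/sys/kernel/perfmon/pmu_desc/pmdN/	<- same per-register attributes
+ *
+ * N stands for the register index of each implemented PMC/PMD.
+ */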