author    Stephen Rothwell <sfr@canb.auug.org.au>    2008-11-11 18:00:33 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>    2008-11-11 18:00:33 +1100
commit    6bfea1858872c1e57d94d686e3144bfa10ca48cb (patch)
tree      cfe7ad66c1f9a14f9d419c3ebf3100264b0044d4
parent    bc6435afdc6a1e0c5236a4a031f372bc1c62341d (diff)
parent    4872c7055867a9b583c76cd7744030dd515a5f35 (diff)
Merge commit 'perfmon3/master'
67 files changed, 7196 insertions, 37 deletions
diff --git a/Documentation/ABI/testing/sysfs-perfmon b/Documentation/ABI/testing/sysfs-perfmon
new file mode 100644
index 000000000000..79c66b59ec5b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon
@@ -0,0 +1,42 @@
+What: /sys/kernel/perfmon
+Date: Oct 2008
+KernelVersion: 2.6.27
+Contact: eranian@gmail.com
+
+Description: Provides the configuration interface for the perfmon subsystem.
+ The tree contains information about the detected hardware,
+ the current state of the subsystem as well as some configuration
+ parameters.
+
+ The tree consists of the following entries:
+
+ /sys/kernel/perfmon/debug (read-write):
+
+ Enables perfmon debugging output. The traces are rate-limited
+ to avoid flooding the console. It is possible to change the
+ throttling via /proc/sys/kernel/printk_ratelimit.
+
+ The value is interpreted as a bitmask. Each bit enables a
+ particular type of debug message. Refer to the file
+ include/linux/perfmon_kern.h for more information.
+
+ /sys/kernel/perfmon/task_group (read-write):
+
+ User group allowed to create a per-thread context (session).
+ -1 means any group.
+
+ /sys/kernel/perfmon/task_sessions_count (read-only):
+
+ Number of per-thread contexts (sessions) currently attached
+ to threads.
+
+ /sys/kernel/perfmon/version (read-only):
+
+ Perfmon interface revision number.
+
+ /sys/kernel/perfmon/arg_mem_max (read-write):
+
+ Maximum size of vector arguments, expressed in bytes.
+ It can be modified but must be at least a page.
+ Default: PAGE_SIZE
diff --git a/Documentation/ABI/testing/sysfs-perfmon-pmu b/Documentation/ABI/testing/sysfs-perfmon-pmu
new file mode 100644
index 000000000000..2fa5a7ca8e8b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-perfmon-pmu
@@ -0,0 +1,48 @@
+What: /sys/kernel/perfmon/pmu
+Date: Nov 2007
+KernelVersion: 2.6.24
+Contact: eranian@gmail.com
+
+Description: Provides information about the active PMU description
+ module. The module contains the mapping of the actual
+ performance counter registers onto the logical PMU exposed by
+ perfmon. There is at most one PMU description module loaded
+ at any time.
+
+ The sysfs PMU tree provides a description of the mapping for
+ each register. There is one subdir per config and data register,
+ along with an entry for the name of the PMU model.
+
+ The entries are as follows:
+
+ /sys/kernel/perfmon/pmu_desc/model (read-only):
+
+ Name of the PMU model, in clear text and zero terminated.
+
+ Then, each logical PMU register XX gets a subtree with the
+ following entries:
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only):
+
+ The physical address or index of the actual underlying hardware
+ register. On Itanium, it corresponds to the index; on X86
+ processors, it is the actual MSR address.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only):
+
+ The default value of the register in hexadecimal.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only):
+
+ The name of the hardware register.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only):
+
+ Bitmask of reserved bits, i.e., bits which cannot be changed
+ by applications. When a bit is set, it means the corresponding
+ bit in the actual register is reserved.
+
+ /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only):
+
+ The width in bits of the register. This field is only
+ relevant for counter registers.
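The sysfs entries documented above can be probed from user space with plain file I/O. A minimal sketch in C (paths taken from the ABI files above; it assumes a kernel built with CONFIG_PERFMON, otherwise /sys/kernel/perfmon is absent):

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            FILE *f = fopen("/sys/kernel/perfmon/version", "r");

            if (!f)         /* perfmon not compiled in, or sysfs not mounted */
                    return 1;
            if (fgets(buf, sizeof(buf), f))
                    printf("perfmon interface version: %s", buf);
            fclose(f);
            return 0;
    }

The same pattern works for the read-write entries, e.g., writing a bitmask string to /sys/kernel/perfmon/debug to enable debug output.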
diff --git a/Documentation/perfmon.txt b/Documentation/perfmon.txt
new file mode 100644
index 000000000000..818c53770e8b
--- /dev/null
+++ b/Documentation/perfmon.txt
@@ -0,0 +1,206 @@
+ The perfmon hardware monitoring interface
+ ------------------------------------------
+ Stephane Eranian
+ <eranian@gmail.com>
+
+I/ Introduction
+
+ The perfmon interface provides access to the hardware performance counters
+ of major processors. Nowadays, all processors implement some flavor of
+ performance counters which capture micro-architectural level information
+ such as the number of elapsed cycles, number of cache misses, and so on.
+
+ The interface is implemented as a set of new system calls and a set of
+ config files in /sys.
+
+ It is possible to monitor a single thread or a CPU. In either mode,
+ applications can count or sample. System-wide monitoring is supported by
+ running a monitoring session on each CPU. The interface supports event-based
+ sampling where the sampling period is expressed as the number of occurrences
+ of an event, instead of just a timeout. This approach provides better
+ granularity and flexibility.
+
+ For performance reasons, it is possible to use a kernel-level sampling buffer
+ to minimize the overhead incurred by sampling. The format of the buffer,
+ what is recorded, how it is recorded, and how it is exported to users is
+ controlled by a kernel module called a sampling format. The current
+ implementation comes with a default format but it is possible to create
+ additional formats. There is a kernel registration interface for formats.
+ Each format is identified by a simple string which a tool can pass when a
+ monitoring session is created.
+
+ The interface also provides support for event sets and multiplexing to work
+ around hardware limitations in the number of available counters or in how
+ events can be combined. Each set defines as many counters as the hardware
+ can support. The kernel then multiplexes the sets. The interface supports
+ time-based switching but also overflow-based switching, i.e., after n
+ overflows of designated counters.
+
+ Applications never manipulate the actual performance counter registers.
+ Instead they see a logical Performance Monitoring Unit (PMU) composed of a
+ set of config registers (PMC) and a set of data registers (PMD). Note that
+ PMDs are not necessarily counters; they can be buffers. The logical PMU is
+ then mapped onto the actual PMU using a mapping table which is implemented
+ as a kernel module. The mapping is chosen once for each new processor. It is
+ visible in /sys/kernel/perfmon/pmu_desc. The kernel module is automatically
+ loaded on first use.
+
+ A monitoring session is uniquely identified by a file descriptor obtained
+ when the session is created. File sharing semantics apply when the session
+ is accessed inside a process. A session is never inherited across fork. The
+ file descriptor can be used to receive notifications of counter overflows
+ or of a full sampling buffer. It is possible to use poll/select on the
+ descriptor to wait for notifications from multiple sessions. Similarly, the
+ descriptor supports asynchronous notifications via SIGIO.
+
+ Counters are always exported as being 64-bit wide regardless of what the
+ underlying hardware implements.
+
+II/ Kernel compilation
+
+ To enable perfmon, you need to enable CONFIG_PERFMON and also some of the
+ model-specific PMU modules.
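To make the session life cycle concrete before the details that follow, here is a sketch of a self-monitoring thread using the system calls described in section VI below. It is illustrative only: the pfarg_sinfo_t and pfarg_pmr_t types come from the interface headers added by this patch, but the reg_num/reg_value field names and the raw PMC encoding are assumptions made here for the example; real event encodings would come from a library such as libpfm. Error checking is trimmed.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perfmon.h>

    int main(void)
    {
            pfarg_sinfo_t sif;
            pfarg_pmr_t pc, pd;
            int fd;

            memset(&sif, 0, sizeof(sif));
            memset(&pc, 0, sizeof(pc));
            memset(&pd, 0, sizeof(pd));

            /* create a per-thread session; sif reports usable PMC/PMD registers */
            fd = syscall(__NR_pfm_create, 0, &sif);
            if (fd < 0)
                    return 1;

            /* program one config register; 0x41003c is a placeholder
               event encoding and the field names are assumed */
            pc.reg_num = 0;
            pc.reg_value = 0x41003c;
            syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pc, sizeof(pc));

            /* zero the matching data register (the counter) */
            pd.reg_num = 0;
            syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));

            /* attach to this very thread (gettid, not pthread_self) and start */
            syscall(__NR_pfm_attach, fd, 0, (int) syscall(SYS_gettid));
            syscall(__NR_pfm_set_state, fd, 0, PFM_ST_START);

            /* ... workload being measured ... */

            syscall(__NR_pfm_set_state, fd, 0, PFM_ST_STOP);
            syscall(__NR_pfm_read, fd, 0, PFM_RW_PMD, &pd, sizeof(pd));
            printf("counter 0: %llu\n", (unsigned long long) pd.reg_value);

            close(fd);      /* destroys the session */
            return 0;
    }

Note that the value read back is a full 64-bit count regardless of the hardware counter width, as stated above.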
+
+III/ OProfile interactions
+
+ The set of features offered by perfmon is rich enough to support migrating
+ OProfile on top of it. That means that PMU programming and low-level
+ interrupt handling could be done by perfmon. The OProfile sampling buffer
+ management code in the kernel as well as how samples are exported to users
+ could remain through the use of a sampling format. This is how OProfile
+ works on Itanium.
+
+ The current interactions with OProfile are:
+ - On X86: Both subsystems can be compiled into the same kernel. There
+ is enforced mutual exclusion between the two subsystems. When
+ there is an OProfile session, no perfmon session can exist
+ and vice-versa.
+
+ - On IA-64: OProfile works on top of perfmon. OProfile being a
+ system-wide monitoring tool, the regular per-thread vs.
+ system-wide session restrictions apply.
+
+ - On PPC: no integration yet. Only one subsystem can be enabled.
+ - On MIPS: no integration yet. Only one subsystem can be enabled.
+
+IV/ User tools
+
+ We have released a simple monitoring tool to demonstrate the features of
+ the interface. The tool is called pfmon and it comes with a simple helper
+ library called libpfm. The library comes with a set of examples to show
+ how to use the kernel interface. Visit http://perfmon2.sf.net for details.
+
+ There may be other tools available for perfmon.
+
+V/ How to program?
+
+ The best way to learn how to program perfmon is to take a look at the
+ source code for the examples in libpfm. The source code is available from:
+
+ http://perfmon2.sf.net
+
+VI/ System calls overview
+
+ In this section, we describe the state of the interface as submitted to the
+ kernel. There are more extensions available, and we will update the section
+ as they get implemented in the upstream kernel.
+
+ The interface is implemented by the following system calls:
+
+ * int pfm_create(int flags, pfarg_sinfo_t *s);
+
+ This function creates a perfmon per-thread session.
+ The flags parameter is currently unused and must be set to 0.
+
+ Upon return and if s is not NULL, the kernel returns the list of available
+ PMC and PMD registers. Tools should not assume they have access to the
+ entire PMU; it may be shared with other kernel subsystems, e.g., on X86
+ the NMI watchdog timer.
+
+ The function returns the file descriptor identifying the session.
+
+ * int pfm_write(int fd, int flags, int type, void *d, size_t sz)
+
+ This function is used to write PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to write and determines the type
+ of the d parameter. The following types are defined:
+
+ - PFM_RW_PMC: write PMC registers, expect pfarg_pmr_t pointer for d
+ - PFM_RW_PMD: write PMD registers, expect pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
+ * int pfm_read(int fd, int flags, int type, void *d, size_t sz);
+
+ This function is used to read PMU registers for the session identified
+ by fd.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The type reflects the type of registers to read and determines the type
+ of the d parameter.
+ The following types are supported:
+
+ - PFM_RW_PMD: read PMD registers, expect pfarg_pmr_t pointer for d
+
+ The type field is not a bitmask, only one type can be passed per call.
+
+ Reading of PMC registers is not allowed.
+
+ The sz parameter describes the size of the vector of elements passed in d.
+
+ * int pfm_attach(int fd, int flags, int target);
+
+ This function is used to attach and detach the session to and from a
+ thread.
+
+ To attach, the thread is identified by target, which must be the
+ value returned by gettid() (not pthread_self()). For a single threaded
+ process, that value is equal to the value returned by getpid().
+
+ To detach, the special target PFM_NO_TARGET must be passed.
+
+ The flags parameter is currently unused and must be set to 0.
+
+ The session is always attached as stopped, i.e., with monitoring
+ inactive. Monitoring is always stopped as a consequence of detaching.
+
+ * int pfm_set_state(int fd, int flags, int state);
+
+ This function is used to set the running state of the session. The new
+ state is indicated by state.
+
+ The following states are defined, only one can be specified at a time:
+
+ - PFM_ST_START: start monitoring
+ - PFM_ST_STOP: stop monitoring
+
+ The flags parameter is currently unused and must be set to 0.
+
+ * int close(int fd)
+
+ To destroy a session, the regular close() system call is used.
+
+VII/ /sys interface overview
+
+ Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed
+ description of the sysfs interface of perfmon2.
+
+VIII/ debugfs interface overview
+
+ Refer to Documentation/perfmon-debugfs.txt for a detailed description of the
+ debug and statistics interface of perfmon.
+
+IX/ Documentation
+
+ Visit http://perfmon2.sf.net
diff --git a/Makefile b/Makefile
@@ -621,6 +621,8 @@ export mod_strip_cmd
 ifeq ($(KBUILD_EXTMOD),)
 core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-$(CONFIG_PERFMON) += perfmon/
+
 vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
 $(net-y) $(net-m) $(libs-y) $(libs-m)))
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..ad604df6a2b6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -505,7 +505,7 @@ config COMPAT_FOR_U64_ALIGNMENT
 config IA64_MCA_RECOVERY
 tristate "MCA recovery from errors other than TLB."
-config PERFMON +config PERFMON_V20 bool "Performance monitor support" help Selects whether support for the IA-64 performance monitor hardware diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig index 6dd8655664f3..2c04fbe6c414 100644 --- a/arch/ia64/configs/bigsur_defconfig +++ b/arch/ia64/configs/bigsur_defconfig @@ -134,7 +134,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y # CONFIG_IA64_MCA_RECOVERY is not set -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index e05f9e1d3faa..7d89a19fc8b3 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -209,7 +209,7 @@ CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_MC_ERR_INJECT is not set CONFIG_SGI_SN=y diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig index e86fbd39c795..5f8c7721e29a 100644 --- a/arch/ia64/configs/gensparse_defconfig +++ b/arch/ia64/configs/gensparse_defconfig @@ -142,7 +142,7 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y CONFIG_SGI_SN=y diff --git a/arch/ia64/configs/sim_defconfig b/arch/ia64/configs/sim_defconfig index 546a772f438e..d51457af7ca6 100644 --- a/arch/ia64/configs/sim_defconfig +++ b/arch/ia64/configs/sim_defconfig @@ -133,7 +133,7 @@ CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y # CONFIG_IA64_MCA_RECOVERY is not set -# CONFIG_PERFMON is not set +# CONFIG_PERFMON_V20 is not set CONFIG_IA64_PALINFO=m # diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index c522edf23c62..318d846ab253 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -156,7 +156,7 @@ CONFIG_VIRTUAL_MEM_MAP=y CONFIG_HOLES_IN_ZONE=y # CONFIG_IA32_SUPPORT is not set CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_MC_ERR_INJECT is not set # CONFIG_IA64_ESI is not set diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig index 0a06b1333c95..2bf0ad40398f 100644 --- a/arch/ia64/configs/zx1_defconfig +++ b/arch/ia64/configs/zx1_defconfig @@ -153,7 +153,7 @@ CONFIG_HOLES_IN_ZONE=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y +CONFIG_PERFMON_V20=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_ESI is not set # CONFIG_KEXEC is not set diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index f88fa054d01d..3ecf7e0b44cb 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -321,7 +321,7 @@ struct thread_struct { #else # define INIT_THREAD_IA32 #endif /* CONFIG_IA32_SUPPORT */ -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 void *pfm_context; /* pointer to detailed PMU context */ unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ # define INIT_THREAD_PM .pfm_context = NULL, \ diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h index 927a381c20ca..387e54030af1 100644 --- a/arch/ia64/include/asm/system.h +++ b/arch/ia64/include/asm/system.h @@ -224,7 +224,7 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct # define 
IA64_ACCOUNT_ON_SWITCH(p,n) #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 DECLARE_PER_CPU(unsigned long, pfm_syst_info); # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) #else diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea954892..93819cca7d96 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o +obj-$(CONFIG_PERFMON_V20) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 28d3d483db92..db54bd497cf6 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -40,7 +40,7 @@ #include <asm/system.h> #include <asm/tlbflush.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 # include <asm/perfmon.h> #endif @@ -660,7 +660,7 @@ init_IRQ (void) } #endif #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 pfm_init_percpu(); #endif platform_irq_init(); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 0e499757309b..5f6efcfa2de4 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -52,7 +52,7 @@ #include <asm/uaccess.h> #include <asm/delay.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* * perfmon context state */ @@ -6831,10 +6831,10 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs) * the psr bits are already set properly in copy_threads() */ } -#else /* !CONFIG_PERFMON */ +#else /* !CONFIG_PERFMON_V20 */ asmlinkage long sys_perfmonctl (int fd, int cmd, void *arg, int count) { return -ENOSYS; } -#endif /* CONFIG_PERFMON */ +#endif /* CONFIG_PERFMON_V20 */ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..afbf1a8205ee 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -46,7 +46,7 @@ #include "entry.h" -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 # include <asm/perfmon.h> #endif @@ -174,7 +174,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) return; } -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if (current->thread.pfm_needs_checking) /* * Note: pfm_handle_work() allow us to call it with interrupts @@ -334,14 +334,14 @@ cpu_idle (void) void ia64_save_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 unsigned long info; #endif if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_save_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) pfm_save_regs(task); @@ -359,14 +359,14 @@ ia64_save_extra (struct task_struct *task) void ia64_load_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 unsigned long info; #endif if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_load_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) pfm_load_regs(task); @@ -523,7 +523,7 @@ copy_thread (int nr, unsigned long clone_flags, } #endif -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 if (current->thread.pfm_context) pfm_inherit(p, child_ptregs); #endif @@ -735,7 +735,7 @@ exit_thread (void) { ia64_drop_fpu(current); -#ifdef 
CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* if needed, stop monitoring and flush state to perfmon context */ if (current->thread.pfm_context) pfm_exit_thread(current); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 92c9689b7d97..ffd212fd2d36 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -31,7 +31,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/unwind.h> -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 #include <asm/perfmon.h> #endif @@ -2105,7 +2105,7 @@ access_uarea(struct task_struct *child, unsigned long addr, "address 0x%lx\n", addr); return -1; } -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* * Check if debug registers are used by perfmon. This * test must be done once we know that we can do the diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 1dcbb85fc4ee..f865315a9248 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -381,7 +381,7 @@ smp_callin (void) extern void ia64_init_itm(void); extern volatile int time_keeper_id; -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 extern void pfm_init_percpu(void); #endif @@ -411,7 +411,7 @@ smp_callin (void) ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 pfm_init_percpu(); #endif diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile index 98771e2a78af..754f4153123e 100644 --- a/arch/ia64/lib/Makefile +++ b/arch/ia64/lib/Makefile @@ -13,7 +13,7 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o -lib-$(CONFIG_PERFMON) += carta_random.o +lib-$(CONFIG_PERFMON_V20) += carta_random.o AFLAGS___divdi3.o = AFLAGS___udivdi3.o = -DUNSIGNED diff --git a/arch/ia64/oprofile/Makefile b/arch/ia64/oprofile/Makefile index aad27a718ee0..3323fd5a46e9 100644 --- a/arch/ia64/oprofile/Makefile +++ b/arch/ia64/oprofile/Makefile @@ -7,4 +7,4 @@ DRIVER_OBJS := $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_PERFMON) += perfmon.o +oprofile-$(CONFIG_PERFMON_V20) += perfmon.o diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c index 31b545c35460..9ed2bc152fba 100644 --- a/arch/ia64/oprofile/init.c +++ b/arch/ia64/oprofile/init.c @@ -20,7 +20,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) { int ret = -ENODEV; -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 /* perfmon_init() can fail, but we have no way to report it */ ret = perfmon_init(ops); #endif @@ -32,7 +32,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) void oprofile_arch_exit(void) { -#ifdef CONFIG_PERFMON +#ifdef CONFIG_PERFMON_V20 perfmon_exit(); #endif } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b5e714373385..cdc53491c033 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1516,6 +1516,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. 
+source "arch/x86/perfmon/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Makefile b/arch/x86/Makefile index cf72b569db41..f3af2b0b4f15 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -155,6 +155,9 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ core-y += arch/x86/kernel/ core-y += arch/x86/mm/ +# perfmon support +core-$(CONFIG_PERFMON) += arch/x86/perfmon/ + # Remaining sub architecture files core-y += $(mcore-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 256b00b61892..891af3e6b3a6 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -826,4 +826,9 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_pfm_create + .quad sys_pfm_write + .quad sys_pfm_read /* 335 */ + .quad sys_pfm_attach + .quad sys_pfm_set_state ia32_syscall_end: diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa5..15d495f73485 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += perfmon.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f941..0ba6dd3aa24e 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -87,6 +87,11 @@ #define LOCAL_TIMER_VECTOR 0xef /* + * Perfmon PMU interrupt vector + */ +#define LOCAL_PERFMON_VECTOR 0xee + +/* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h index 6b1add8e31dd..e940722dc1f0 100644 --- a/arch/x86/include/asm/mach-default/entry_arch.h +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -33,4 +33,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_PERFMON +BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR) +#endif + #endif diff --git a/arch/x86/include/asm/perfmon.h b/arch/x86/include/asm/perfmon.h new file mode 100644 index 000000000000..906f4b24cf0c --- /dev/null +++ b/arch/x86/include/asm/perfmon.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * This file contains i386/x86_64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON_H_ +#define _ASM_X86_PERFMON_H_ + +/* + * arch-specific user visible interface definitions + */ + +#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ +#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ + +#endif /* _ASM_X86_PERFMON_H_ */ diff --git a/arch/x86/include/asm/perfmon_kern.h b/arch/x86/include/asm/perfmon_kern.h new file mode 100644 index 000000000000..7cadbb894e83 --- /dev/null +++ b/arch/x86/include/asm/perfmon_kern.h @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This file contains X86 Processor Family specific definitions + * for the perfmon interface. This covers P6, Pentium M, P4/Xeon + * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON_KERN_H_ +#define _ASM_X86_PERFMON_KERN_H_ + +#ifdef CONFIG_PERFMON +#include <linux/unistd.h> +#ifdef CONFIG_4KSTACKS +#define PFM_ARCH_STK_ARG 8 +#else +#define PFM_ARCH_STK_ARG 16 +#endif + +struct pfm_arch_pmu_info { + u32 flags; /* PMU feature flags */ + /* + * mandatory model-specific callbacks + */ + int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set); + int (*has_ovfls)(struct pfm_context *ctx); + void (*quiesce)(void); + + /* + * optional model-specific callbacks + */ + void (*acquire_pmu_percpu)(void); + void (*release_pmu_percpu)(void); + int (*load_context)(struct pfm_context *ctx); + void (*unload_context)(struct pfm_context *ctx); +}; + +/* + * PMU feature flags + */ +#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */ +#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */ + +struct pfm_x86_ctx_flags { + unsigned int insecure:1; /* rdpmc per-thread self-monitoring */ + unsigned int reserved:31; /* for future use */ +}; + +struct pfm_arch_context { + u64 saved_real_iip; /* instr pointer of last NMI intr */ + struct pfm_x86_ctx_flags flags; /* flags */ + int saved_started; +}; + +/* + * functions implemented as inline on x86 + */ + +/** + * pfm_arch_write_pmc - write a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * @value: PMC 64-bit value + * + * in certain situations, ctx may be NULL + */ +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)", 
pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) value); + + wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_write_pmd - write a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * @value: PMD 64-bit value + */ +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * to make sure the counter overflows, we set the + * upper bits. we also clear any other unimplemented + * bits as these may cause a crash on some processors. + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value = (value | ~pfm_pmu_conf->ovfl_mask) + & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; + + PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) value); + + wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_read_pmd - read a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + + rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_read_pmc - read a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + + rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx", + pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_is_active - return non-zero if monitoring has been started + * @ctx: context to check + * + * At certain points, perfmon needs to know if monitoring has been + * explicitly started. + * + * On x86, there is no other way but to use pfm_start/pfm_stop + * to activate monitoring, thus we can simply check flags.started + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + + +/** + * pfm_arch_unload_context - detach context from thread or CPU + * @ctx: context to detach + * + * in system-wide ctx->task is NULL, otherwise it points to the + * attached thread + */ +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + if (ctx_arch->flags.insecure) { + PFM_DBG("clear cr4.pce"); + clear_in_cr4(X86_CR4_PCE); + } + + if (pmu_info->unload_context) + pmu_info->unload_context(ctx); +} + +/** + * pfm_arch_load_context - attach context to thread or CPU + * @ctx: context to attach + */ +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * RDPMC authorized in system-wide and + * per-thread self-monitoring. + * + * RDPMC only gives access to counts. + * + * The context-switch routine does not restore + * all the PMD registers (optimization), thus there + * is a possible leak of counts in per-thread + * mode. 
+ */ + if (ctx->task == current) { + PFM_DBG("set cr4.pce"); + set_in_cr4(X86_CR4_PCE); + ctx_arch->flags.insecure = 1; + } + + if (pmu_info->load_context) + ret = pmu_info->load_context(ctx); + + return ret; +} + +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); + +/** + * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt + * @ctx: current context + * @set: current event set + * + * called from __pfm_interrupt_handler(). + * ctx is not NULL. ctx is locked. interrupts are masked + * + * The following actions must take place: + * - stop all monitoring to ensure handler has consistent view. + * - collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + /* + * on X86, freezing is equivalent to stopping + */ + pfm_arch_stop(current, ctx); + + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_arch_restore_pmcs() + */ + ctx_arch->saved_started = ctx->flags.started; + ctx->flags.started = 0; +} + +/** + * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring + * @ctx: current context + * + * current context may be NULL when dealing with spurious interrupts + * + * Must re-activate monitoring if context is not MASKED. + * Interrupts are masked. + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + if (ctx == NULL) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + PFM_DBG_ovfl("state=%d", ctx->state); + + /* + * restore flags.started which is cleared in + * pfm_arch_intr_freeze_pmu() + */ + ctx->flags.started = ctx_arch->saved_started; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/** + * pfm_arch_ovfl_reset_pmd - reset pmd on overflow + * @ctx: current context + * @cnum: PMD index + * + * On some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + * + * For x86, simply setting the upper bits would lose whatever remains in the + * counter, which usually holds a small count. In order not to lose this count, + * we do a read-modify-write to set the upper bits while preserving the + * low-order bits. This is slow but works. 
+ */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + pfm_arch_write_pmd(ctx, cnum, val); +} + +/** + * pfm_arch_context_create - create context + * @ctx: newly created context + * @ctx_flags: context flags as passed by user + * + * called from __pfm_create_context() + */ +static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + return 0; +} + +/** + * pfm_arch_context_free - free context + * @ctx: context to free + */ +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +/* + * functions implemented in arch/x86/perfmon/perfmon.c + */ +int pfm_arch_init(void); +void pfm_arch_resend_irq(struct pfm_context *ctx); + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); +void pfm_arch_pmu_config_remove(void); +char *pfm_arch_get_pmu_module_name(void); +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds); +void pfm_arch_pmu_release(void); + +static inline void pfm_arch_serialize(void) +{} + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) +/* + * x86 does not need extra alignment requirements for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 + +asmlinkage void pmu_interrupt(void); + +static inline void pfm_arch_bv_copy(u64 *a, u64 *b, int nbits) +{ + bitmap_copy((unsigned long *)a, + (unsigned long *)b, + nbits); +} + +static inline void pfm_arch_bv_or(u64 *a, u64 *b, u64 *c, int nbits) +{ + bitmap_or((unsigned long *)a, + (unsigned long *)b, + (unsigned long *)c, + nbits); +} + +static inline void pfm_arch_bv_and(u64 *a, u64 *b, u64 *c, int nbits) +{ + bitmap_and((unsigned long *)a, + (unsigned long *)b, + (unsigned long *)c, + nbits); +} + + +static inline void pfm_arch_bv_zero(u64 *a, int nbits) +{ + bitmap_zero((unsigned long *)a, nbits); +} + +static inline int pfm_arch_bv_weight(u64 *a, int nbits) +{ + return bitmap_weight((unsigned long *)a, nbits); +} + +static inline void pfm_arch_bv_set_bit(int b, u64 *a) +{ + __set_bit(b, (unsigned long *)a); +} + +static inline void pfm_arch_bv_clear_bit(int b, u64 *a) +{ + __clear_bit(b, (unsigned long *)a); +} + +static inline int pfm_arch_bv_test_bit(int b, u64 *a) +{ + return test_bit(b, (unsigned long *)a); +} + +static inline unsigned long pfm_arch_bv_find_next_bit(const u64 *addr, + unsigned long size, + unsigned long offset) +{ + return find_next_bit((unsigned long *)addr, + size, + offset); +} +#endif /* CONFIG_PERFMON */ + +#endif /* _ASM_X86_PERFMON_KERN_H_ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..0ddd534bef44 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -79,6 +79,7 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit 
process */ @@ -92,6 +93,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ +#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -114,6 +116,8 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) +#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -135,12 +139,12 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERFMON_WORK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + _TIF_NOTSC|_TIF_PERFMON_CTXSW) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a4..06908451002f 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,11 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_pfm_create 333 +#define __NR_pfm_write (__NR_pfm_create+1) +#define __NR_pfm_read (__NR_pfm_create+2) +#define __NR_pfm_attach (__NR_pfm_create+3) +#define __NR_pfm_set_state (__NR_pfm_create+4) #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 834b2c1d89fb..a42bb5eb9edb 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,7 +653,16 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) - +#define __NR_pfm_create 295 +__SYSCALL(__NR_pfm_create, sys_pfm_create) +#define __NR_pfm_write (__NR_pfm_create+1) +__SYSCALL(__NR_pfm_write, sys_pfm_write) +#define __NR_pfm_read (__NR_pfm_create+2) + __SYSCALL(__NR_pfm_read, sys_pfm_read) +#define __NR_pfm_attach (__NR_pfm_create+3) +__SYSCALL(__NR_pfm_attach, sys_pfm_attach) +#define __NR_pfm_set_state (__NR_pfm_create+4) +__SYSCALL(__NR_pfm_set_state, sys_pfm_set_state) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c97..9f8826f33032 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -513,7 +513,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx jz work_notifysig work_resched: call schedule diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 983d85aeccce..1d9bef0797d9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -876,7 +876,13 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + +#ifdef CONFIG_PERFMON +ENTRY(pmu_interrupt) + apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt +END(pmu_interrupt) +#endif + /* * Exception entry points. 
*/ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff0235391285..24a0140e6c36 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -11,6 +11,7 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/bitops.h> +#include <linux/perfmon_kern.h> #include <asm/acpi.h> #include <asm/atomic.h> @@ -224,6 +225,10 @@ void __init native_init_IRQ(void) apic_intr_init(); +#ifdef CONFIG_PERFMON + alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d45..7ff71d4d6d9b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> +#include <linux/perfmon_kern.h> #include <linux/prctl.h> #include <linux/dmi.h> @@ -258,6 +259,7 @@ void exit_thread(void) ds_free(current->thread.ds_ctx); } #endif /* CONFIG_X86_DS */ + pfm_exit_thread(); } void flush_thread(void) @@ -315,6 +317,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(gs, p->thread.gs); + pfm_copy_thread(p); + tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -458,11 +462,17 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = update_debugctl(prev, next, prev->debugctlmsr); if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { set_debugreg(next->debugreg0, 0); set_debugreg(next->debugreg1, 1); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3180e79c3697..86099f98104a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/tick.h> +#include <linux/perfmon_kern.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -255,6 +256,7 @@ void exit_thread(void) ds_free(t->ds_ctx); } #endif /* CONFIG_X86_DS */ + pfm_exit_thread(); } void flush_thread(void) @@ -359,6 +361,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + pfm_copy_thread(p); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -487,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, prev = &prev_p->thread, next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = prev->debugctlmsr; #ifdef CONFIG_X86_DS @@ -513,6 +520,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 27a5c8174322..7d6fc603dea7 100644 --- a/arch/x86/kernel/signal_32.c +++ 
b/arch/x86/kernel/signal_32.c @@ -19,6 +19,7 @@ #include <linux/wait.h> #include <linux/tracehook.h> #include <linux/elf.h> +#include <linux/perfmon_kern.h> #include <linux/smp.h> #include <linux/mm.h> @@ -749,6 +750,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_user(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index d2307e41fbdb..24e389836fc0 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -21,6 +21,7 @@ #include <linux/personality.h> #include <linux/compiler.h> #include <linux/uaccess.h> +#include <linux/perfmon_kern.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -538,6 +539,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_user(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c3..81c22739f70b 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,8 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_pfm_create + .long sys_pfm_write + .long sys_pfm_read /* 335 */ + .long sys_pfm_attach + .long sys_pfm_set_state diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 022cd41ea9b4..584a9ef4e44c 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -17,6 +17,7 @@ #include <linux/moduleparam.h> #include <linux/kdebug.h> #include <linux/cpu.h> +#include <linux/perfmon_kern.h> #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> @@ -142,12 +143,18 @@ static int nmi_setup(void) int err = 0; int cpu; - if (!allocate_msrs()) + if (pfm_session_allcpus_acquire()) + return -EBUSY; + + if (!allocate_msrs()) { + pfm_session_allcpus_release(); return -ENOMEM; + } err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + pfm_session_allcpus_release(); return err; } @@ -228,6 +235,7 @@ static void nmi_shutdown(void) msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); + pfm_session_allcpus_release(); put_cpu_var(cpu_msrs); } diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig new file mode 100644 index 000000000000..8144d1d0d600 --- /dev/null +++ b/arch/x86/perfmon/Kconfig @@ -0,0 +1,33 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + select X86_LOCAL_APIC + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See <http://perfmon2.sf.net/> for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config X86_PERFMON_INTEL_ARCH + bool "Support for Intel architectural perfmon v1/v2/v3" + depends on PERFMON + default n + help + Enables support for Intel architectural performance counters. 
+ This feature was introduced with Intel Core Solo/Core Duo processors. + +config X86_PERFMON_AMD64 + bool "Support AMD Athlon/Opteron hardware performance counters" + depends on PERFMON + default n + help + Enables support for Athlon/Opteron hardware performance counters. + Supports family 6, 15 and 16 processors. + endmenu diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile new file mode 100644 index 000000000000..c0a4ca0da329 --- /dev/null +++ b/arch/x86/perfmon/Makefile @@ -0,0 +1,7 @@ +# +# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian <eranian@hpl.hp.com> +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o +obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c new file mode 100644 index 000000000000..844f19dc6cb0 --- /dev/null +++ b/arch/x86/perfmon/perfmon.c @@ -0,0 +1,619 @@ +/* + * This file implements the X86 specific support for the perfmon2 interface + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/interrupt.h> +#include <linux/perfmon_kern.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/nmi.h> + +#include <asm/apic.h> + +DEFINE_PER_CPU(unsigned long, real_iip); +DEFINE_PER_CPU(int, pfm_using_nmi); + +/** + * pfm_arch_ctxswin_thread - thread context switch in + * @task: task switched in + * @ctx: context for the task + * @set: active event set + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * + * Caller has already restored all PMD and PMC registers, if + * necessary (i.e., lazy restore scheme). + * + * On x86, the common code only needs to unsecure RDPMC if necessary + * + * Model-specific features, e.g., PEBS and IBS, are taken care of in the + * corresponding PMU description module + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * restore saved real iip + */ + if (ctx->active_set->npend_ovfls) + __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; + + /* + * enable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + set_in_cr4(X86_CR4_PCE); +} + +/** + * pfm_arch_ctxswout_thread - context switch out thread + * @task: task switched out + * @ctx : context switched out + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
+ * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_arch_pmu_info *pmu_info; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * disable lazy restore of PMCs on ctxswin because + * we modify some of them. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + if (ctx->active_set->npend_ovfls) + ctx_arch->saved_real_iip = __get_cpu_var(real_iip); + + /* + * disable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + clear_in_cr4(X86_CR4_PCE); + + return pmu_info->stop_save(ctx, ctx->active_set); +} + +/** + * pfm_arch_stop - deactivate monitoring + * @task: task to stop + * @ctx: context to stop + * + * Called from pfm_stop() + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + if (task != current) + return; + + pmu_info->stop_save(ctx, ctx->active_set); +} + + +/** + * pfm_arch_start - activate monitoring + * @task: task to start + * @ctx: context to start + * + * Interrupts are masked. Context is locked. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to the PMU if task + * is not current. + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * cannot restore PMC if no access to PMU. Will be done + * when the thread is switched back in + */ + if (task != current) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/** + * pfm_arch_restore_pmds - reload PMD registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_context_load(), pfm_ctxsw() + * + * Context is locked. Interrupts are masked. Set cannot be NULL. + * Access to the PMU is guaranteed. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u16 i, num; + + num = set->nused_pmds; + + /* + * we can restore only the PMDs we use because: + * + * - can only read with pfm_read_pmds() the registers + * declared used via pfm_write_pmds() + * + * - if cr4.pce=1, only counters are exposed to user. RDPMC + * does not work with other types of PMU registers. Thus, no + * address is ever exposed by counters + * + * - there is never a dependency between one pmd register and + * another + */ + for (i = 0; num; i++) { + if (likely(pfm_arch_bv_test_bit(i, set->used_pmds))) { + pfm_write_pmd(ctx, i, set->pmds[i]); + num--; + } + } +} + +/** + * pfm_arch_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_context_load(), pfm_ctxsw(). + * + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. 
+ * Function must restore all PMC registers from set.
+ */
+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
+{
+	u16 i, num;
+
+	/*
+	 * we need to restore PMCs only when:
+	 * - context is not masked
+	 * - monitoring is activated
+	 *
+	 * Masking monitoring after an overflow does not change the
+	 * value of flags.started
+	 */
+	if (!ctx->flags.started)
+		return;
+
+	/*
+	 * restore all pmcs
+	 *
+	 * It is not possible to restore only the pmcs we used because
+	 * certain PMU models (e.g. Pentium 4) have dependencies. Thus
+	 * we do not want one application using stale PMCs coming from
+	 * another one.
+	 *
+	 * On PMU models where there are no dependencies between PMCs, it
+	 * is possible to optimize by only restoring the registers that
+	 * are used, but this has to be done by model-specific code.
+	 */
+	num = ctx->regs.num_pmcs;
+	for (i = 0; num; i++) {
+		if (pfm_arch_bv_test_bit(i, ctx->regs.pmcs)) {
+			pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
+			num--;
+		}
+	}
+}
+
+/**
+ * smp_pmu_interrupt - lowest level PMU interrupt handler for x86
+ * @regs: machine state
+ *
+ * The PMU interrupt is handled through an interrupt gate, therefore
+ * the CPU automatically clears EFLAGS.IF, i.e., masks interrupts.
+ *
+ * The perfmon interrupt handler MUST run with interrupts disabled due
+ * to possible races with other, higher priority interrupts, such as the
+ * timer or IPI function calls.
+ *
+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
+ */
+void smp_pmu_interrupt(struct pt_regs *regs)
+{
+	unsigned long iip;
+	int using_nmi;
+
+	using_nmi = __get_cpu_var(pfm_using_nmi);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	/*
+	 * when using NMI, pfm_handle_nmi() gets called
+	 * first. It stops monitoring and records the
+	 * iip into real_iip, then it reposts the interrupt
+	 * using the lower priority vector LOCAL_PERFMON_VECTOR
+	 *
+	 * On some processors, e.g., P4, it may be that some
+	 * state is already recorded from pfm_handle_nmi()
+	 * and it only needs to be copied back into the normal
+	 * fields so it can be used transparently by higher level
+	 * code.
+	 */
+	if (using_nmi)
+		iip = __get_cpu_var(real_iip);
+	else
+		iip = instruction_pointer(regs);
+
+	pfm_interrupt_handler(iip, regs);
+
+	/*
+	 * On Intel processors:
+	 * - it is necessary to clear the MASK field for the LVTPC
+	 *   vector. Otherwise interrupts remain masked. See
+	 *   section 8.5.1
+	 * AMD X86-64:
+	 * - the documentation does not stipulate the behavior but
+	 *   it seems to work without the write, so we skip it
+	 */
+	if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
+
+	irq_exit();
+}
+
+/**
+ * pfm_handle_nmi - PMU NMI handler notifier callback
+ * @nb: notifier block
+ * @val: type of die notifier
+ * @data: die notifier-specific data
+ *
+ * Called from the notify_die() notifier on a trap handler path. We only
+ * care about NMI related callbacks, and ignore everything else.
+ *
+ * Cannot grab any locks, including the perfmon context lock.
+ *
+ * Must detect if the NMI interrupt comes from perfmon, and if so it must
+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
+ * handler needs to grab the context lock, thus it cannot be run directly
+ * from the NMI interrupt call path.
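+ *
+ * Schematically, the resulting two-stage flow is:
+ *
+ *	NMI -> pfm_handle_nmi():
+ *		pmu_info->quiesce();		(stop all counters)
+ *		real_iip = instruction_pointer(regs);
+ *		pfm_arch_resend_irq(ctx);	(self-IPI on LOCAL_PERFMON_VECTOR)
+ *	... interrupts are unmasked ...
+ *	LOCAL_PERFMON_VECTOR -> smp_pmu_interrupt():
+ *		pfm_interrupt_handler(real_iip, regs);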
+ */ +static int __kprobes pfm_handle_nmi(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pfm_context *ctx; + struct pfm_arch_pmu_info *pmu_info; + + /* + * only NMI related calls + */ + if (val != DIE_NMI_IPI) + return NOTIFY_DONE; + + /* + * perfmon not using NMI + */ + if (!__get_cpu_var(pfm_using_nmi)) + return NOTIFY_DONE; + + /* + * No context + */ + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) { + PFM_DBG_ovfl("no ctx"); + return NOTIFY_DONE; + } + + /* + * Detect if we have overflows, i.e., NMI interrupt + * caused by PMU + */ + pmu_info = pfm_pmu_info(); + if (!pmu_info->has_ovfls(ctx)) { + PFM_DBG_ovfl("no ovfl"); + return NOTIFY_DONE; + } + + /* + * we stop the PMU to avoid further overflow before this + * one is treated by lower priority interrupt handler + */ + pmu_info->quiesce(); + + /* + * record actual instruction pointer + */ + __get_cpu_var(real_iip) = instruction_pointer(args->regs); + + /* + * post lower priority interrupt (LOCAL_PERFMON_VECTOR) + */ + pfm_arch_resend_irq(ctx); + + /* + * we need to rewrite the APIC vector on Intel + */ + if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * the notification was for us + */ + return NOTIFY_STOP; +} + +static struct notifier_block pfm_nmi_nb = { + .notifier_call = pfm_handle_nmi +}; + +/** + * pfm_arch_resend_irq - post perfmon interrupt on regular vector + * + * called from pfm_ctxswin_thread() and pfm_handle_nmi() + */ +void pfm_arch_resend_irq(struct pfm_context *ctx) +{ + unsigned long val, dest; + /* + * we cannot use hw_resend_irq() because it goes to + * the I/O APIC. We need to go to the Local APIC. + * + * The "int vec" is not the right solution either + * because it triggers a software intr. We need + * to regenerate the interrupt and have it pended + * until we unmask interrupts. + * + * Instead we send ourself an IPI on the perfmon + * vector. + */ + val = APIC_DEST_SELF|APIC_INT_ASSERT| + APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; + + dest = apic_read(APIC_ID); + apic_write(APIC_ICR2, dest); + apic_write(APIC_ICR, val); +} + +/** + * pfm_arch_pmu_acquire_percpu - setup APIC per CPU + * @data: contains pmu flags + */ +static void pfm_arch_pmu_acquire_percpu(void *data) +{ + struct pfm_arch_pmu_info *pmu_info; + unsigned int tmp, vec; + unsigned long flags = (unsigned long)data; + unsigned long lvtpc; + + pmu_info = pfm_pmu_conf->pmu_info; + /* + * we only reprogram the LVTPC vector if we have detected + * no sharing, otherwise it means the APIC is already programmed + * and we use whatever vector (likely NMI) is there + */ + if (!(flags & PFM_X86_FL_SHARING)) { + vec = LOCAL_PERFMON_VECTOR; + + tmp = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, vec); + apic_write(APIC_LVTERR, tmp); + } + lvtpc = (unsigned long)apic_read(APIC_LVTPC); + + __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI; + + PFM_DBG("LTVPC=0x%lx using_nmi=%d", + lvtpc, __get_cpu_var(pfm_using_nmi)); + /* + * invoke model specific acquire routine. 
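	 * A PMU description module opts in by setting this callback in its
	 * struct pfm_arch_pmu_info, e.g., the Intel architectural module
	 * uses:
	 *
	 *	.acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu,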
+ */ + if (pmu_info->acquire_pmu_percpu) + pmu_info->acquire_pmu_percpu(); +} + +/** + * pfm_arch_pmu_acquire - acquire PMU resource from system + * @unavail_pmcs : bitmask to use to set unavailable pmcs + * @unavail_pmds : bitmask to use to set unavailable pmds + * + * interrupts are not masked + * + * Grab PMU registers from lower level MSR allocator + * + * Program the APIC according the possible interrupt vector + * either LOCAL_PERFMON_VECTOR or NMI + */ +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_regmap_desc *d; + u16 i, nlost; + + pmu_info = pfm_pmu_conf->pmu_info; + pmu_info->flags &= ~PFM_X86_FL_SHARING; + + nlost = 0; + + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + /* + * reserve register with lower-level allocator + */ + if (!reserve_evntsel_nmi(d->hw_addr)) { + PFM_DBG("pmc%d(%s) already used", i, d->desc); + pfm_arch_bv_set_bit(i, unavail_pmcs); + nlost++; + continue; + } + } + PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags); + /* + * some PMU models (e.g., P6) do not support sharing + * so check if we found less than the expected number of PMC registers + */ + if (nlost) { + if (pmu_info->flags & PFM_X86_FL_NO_SHARING) { + PFM_INFO("PMU already used by another subsystem, " + "PMU does not support sharing, " + "try disabling Oprofile or " + "reboot with nmi_watchdog=0"); + goto undo; + } + pmu_info->flags |= PFM_X86_FL_SHARING; + } + + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (!reserve_perfctr_nmi(d->hw_addr)) { + PFM_DBG("pmd%d(%s) already used", i, d->desc); + pfm_arch_bv_set_bit(i, unavail_pmds); + } + } + /* + * program APIC on each CPU + */ + on_each_cpu(pfm_arch_pmu_acquire_percpu, + (void *)(unsigned long)pmu_info->flags , 1); + + return 0; +undo: + /* + * must undo reservation of pmcs in case of error + */ + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + if (!pfm_arch_bv_test_bit(i, unavail_pmcs)) + release_evntsel_nmi(d->hw_addr); + } + return -EBUSY; +} + +/** + * pfm-arch_pmu_release_percpu - clear NMI state for one CPU + * + */ +static void pfm_arch_pmu_release_percpu(void *data) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_conf->pmu_info; + + __get_cpu_var(pfm_using_nmi) = 0; + /* + * invoke model specific release routine. 
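	 * this is the counterpart of the acquire path; e.g., the Intel
	 * architectural module hooks
	 *
	 *	.release_pmu_percpu = pfm_intel_arch_release_pmu_percpu,
	 *
	 * which restores MSR_CORE_PERF_GLOBAL_CTRL to its saved value.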
+ */ + if (pmu_info->release_pmu_percpu) + pmu_info->release_pmu_percpu(); +} + +/** + * pfm_arch_pmu_release - release PMU resource to system + * + * called from pfm_pmu_release() + * interrupts are not masked + * + * On x86, we return the PMU registers to the MSR allocator + */ +void pfm_arch_pmu_release(void) +{ + struct pfm_regmap_desc *d; + u16 i, n; + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs_all.num_pmcs; + for (i = 0; n; i++, d++) { + if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) + continue; + release_evntsel_nmi(d->hw_addr); + n--; + PFM_DBG("pmc%u released", i); + } + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs_all.num_pmds; + for (i = 0; n; i++, d++) { + if (!pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmds)) + continue; + release_perfctr_nmi(d->hw_addr); + n--; + PFM_DBG("pmd%u released", i); + } + + /* clear NMI variable if used */ + if (__get_cpu_var(pfm_using_nmi)) + on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1); +} + +/** + * pfm_arch_init - one time global arch-specific initialization + * + * called from pfm_init() + */ +int __init pfm_arch_init(void) +{ + /* + * we need to register our NMI handler when the kernels boots + * to avoid a deadlock condition with the NMI watchdog or Oprofile + * if we were to try and register/unregister on-demand. + */ + register_die_notifier(&pfm_nmi_nb); + return 0; +} diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c new file mode 100644 index 000000000000..f078fe28137d --- /dev/null +++ b/arch/x86/perfmon/perfmon_amd64.c @@ -0,0 +1,483 @@ +/* + * This file contains the PMU description for the Athlon64 and Opteron64 + * processors. It supports 32 and 64-bit modes. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kprobes.h> +#include <linux/vmalloc.h> +#include <linux/topology.h> +#include <linux/pci.h> +#include <linux/perfmon_kern.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> + +static void __kprobes pfm_amd64_quiesce(void); +static int pfm_amd64_has_ovfls(struct pfm_context *ctx); +static int pfm_amd64_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +static struct pfm_arch_pmu_info pfm_amd64_pmu_info = { + .stop_save = pfm_amd64_stop_save, + .has_ovfls = pfm_amd64_has_ovfls, + .quiesce = pfm_amd64_quiesce, +}; + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +/* + * reserved bits must be 1 + * + * for family 15: + * - upper 32 bits are reserved + * - bit 20, bit 21 + * + * for family 16: + * - bits 36-39 are reserved + * - bits 42-63 are reserved + * - bit 20, bit 21 + * + */ +#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) +#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21)) + +static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), +/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), +/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), +}; +#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc) + +/* + * AMD64 counters are 48 bits, upper bits are reserved + */ +#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1)) + +#define PFM_AMD_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PERFCTR"#n, \ + .hw_addr = MSR_K7_PERFCTR0+n, \ + .rsvd_msk = PFM_AMD64_CTR_RSVD, \ + .dep_pmcs[0] = 1ULL << n \ + } + +static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = { +/* pmd0 */ PFM_AMD_D(0), +/* pmd1 */ PFM_AMD_D(1), +/* pmd2 */ PFM_AMD_D(2), +/* pmd3 */ PFM_AMD_D(3) +}; +#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc) + +static struct pfm_context *pfm_nb_task_owner; + +static struct pfm_pmu_config pfm_amd64_pmu_conf; + +/** + * pfm_amd64_acquire_nb -- ensure mutual exclusion for Northbridge events + * @ctx: context to use + * + * There can only be one user per socket for the Northbridge (NB) events, + * so we enforce mutual exclusion as follows: + * - per-thread : only one context machine-wide can use NB events + * + * Exclusion is enforced at: + * - pfm_load_context() + * - pfm_write_pmcs() for attached contexts + * + * Exclusion is released at: + * - pfm_unload_context() or any calls that implicitely uses it + * + * return: + * 0 : successfully acquire NB access + * < 0: errno, failed to acquire NB access + */ +static int pfm_amd64_acquire_nb(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, NULL, ctx); + if (!old) { + PFM_DBG("acquired Northbridge event access globally"); + } else if (old != ctx) { + PFM_DBG("global NorthBridge event conflict"); + return -EBUSY; + } + return 0; +} + +/** + * 
pfm_amd64_pmc_write_check -- check validity of pmc writes + * @ctx: context to use + * @set: event set to use + * @req: user request to modify the pmc + * + * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL,i.e., + * when we have detected a multi-core processor. + * + * context is locked, interrupts are masked + */ +static int pfm_amd64_pmc_write_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmr *req) +{ + unsigned int event; + + /* + * delay checking NB event until we load the context + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + /* + * check event is NB event + */ + event = (unsigned int)(req->reg_value & 0xff); + if (event < 0xee) + return 0; + + return pfm_amd64_acquire_nb(ctx); +} + +/** + * pfm_amd64_load_context - amd64 model-specific load callback + * @ctx: context to use + * + * invoked on pfm_load_context(). + * context is locked, interrupts are masked + */ +static int pfm_amd64_load_context(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, n; + + set = ctx->active_set; + n = set->nused_pmcs; + for (i = 0; n; i++) { + if (!pfm_arch_bv_test_bit(i, set->used_pmcs)) + continue; + + if ((set->pmcs[i] & 0xff) >= 0xee) + goto found; + n--; + } + return 0; +found: + return pfm_amd64_acquire_nb(ctx); +} + +/** + * pfm_amd64_unload_context -- amd64 mdoels-specific unload callback + * @ctx: context to use + * + * invoked on pfm_unload_context() + */ +static void pfm_amd64_unload_context(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, ctx, NULL); + if (old == ctx) + PFM_DBG("released NorthBridge events globally"); +} + +/** + * pfm_amd64_setup_nb_event_ctrl -- initialize NB event controls + * + * detect if we need to activate NorthBridge event access control + */ +static int pfm_amd64_setup_nb_event_ctrl(void) +{ + unsigned int c, n = 0; + unsigned int max_phys = 0; + +#ifdef CONFIG_SMP + for_each_possible_cpu(c) { + if (cpu_data(c).phys_proc_id > max_phys) + max_phys = cpu_data(c).phys_proc_id; + } +#else + max_phys = 0; +#endif + if (max_phys > 255) { + PFM_INFO("socket id %d is too big to handle", max_phys); + return -ENOMEM; + } + + n = max_phys + 1; + if (n < 2) + return 0; + + pfm_nb_task_owner = NULL; + + /* + * activate write-checker for PMC registers + */ + for (c = 0; c < PFM_AMD_NUM_PMCS; c++) + pfm_amd64_pmc_desc[c].type |= PFM_REG_WC; + + pfm_amd64_pmu_info.load_context = pfm_amd64_load_context; + pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context; + + pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check; + + PFM_INFO("NorthBridge event access control enabled"); + + return 0; +} + +/** + * pfm_amd64_setup_register -- initialize register table + * + * modify register table based on actual host CPU + */ +static void pfm_amd64_setup_registers(void) +{ + u16 i; + + pfm_arch_bv_set_bit(0, enable_mask); + pfm_arch_bv_set_bit(1, enable_mask); + pfm_arch_bv_set_bit(2, enable_mask); + pfm_arch_bv_set_bit(3, enable_mask); + max_enable = 3+1; + + /* + * adjust reserved bit fields for family 16 + */ + if (current_cpu_data.x86 == 16) { + for (i = 0; i < PFM_AMD_NUM_PMCS; i++) + if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD) + pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD; + } +} + +/** + * pfm_amd64_probe_pmu -- detect host PMU + */ +static int pfm_amd64_probe_pmu(void) +{ + if 
(current_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -1; + + switch (current_cpu_data.x86) { + case 6: + case 15: + case 16: + PFM_INFO("found family=%d", current_cpu_data.x86); + break; + default: + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } + + if (current_cpu_data.x86_max_cores > 1 + && pfm_amd64_setup_nb_event_ctrl()) + return -1; + + pfm_amd64_setup_registers(); + + return 0; +} + +/** + * pfm_amd64_has_ovfls -- detect if pending overflows + * @ctx: context to use + * + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_regmap_desc *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + /* + * Check regular counters + */ + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = pfm_amd64_pmd_desc; + + for (i = 0; num; i++) { + if (pfm_arch_bv_test_bit(i, cnt_mask)) { + rdmsrl(xrd[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/** + * pfm_amd64_stop_save - stop monitoring, collect pending overflows + * @ctx: context to use + * @set: event set to stop + * + * interrupts are masked, PMU access guaranteed + */ +static int pfm_amd64_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_pmds; + u64 val, wmask, ovfl_mask; + u32 i, count; + + pmu_info = pfm_pmu_info(); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + pfm_arch_bv_and(used_mask, + set->used_pmcs, + enable_mask, + max_enable); + + count = pfm_arch_bv_weight(used_mask, max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, used_mask)) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. 
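	 *
	 * Worked example, assuming ovfl_mask == (1ULL << 47) - 1 to match
	 * counter_width = 47: a counter armed at 0xffffffffff00 (2^48 - 256,
	 * bit 47 set) wraps to 0 after 256 events, so (val & wmask) == 0
	 * flags the pending overflow and the 64-bit software value is
	 * rebuilt as (set->pmds[i] & ~ovfl_mask) | (val & ovfl_mask).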
+ */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, set->used_pmds)) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(pfm_arch_bv_test_bit(i, cnt_pmds))) { + if (!(val & wmask)) { + pfm_arch_bv_set_bit(i,set->povfl_pmds); + set->npend_ovfls++; + } + val = (set->pmds[i] & ~ovfl_mask) + | (val & ovfl_mask); + } + set->pmds[i] = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_amd64_quiesce_pmu -- stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_amd64_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + if (pfm_arch_bv_test_bit(0, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0, 0); + if (pfm_arch_bv_test_bit(1, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+1, 0); + if (pfm_arch_bv_test_bit(2, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+2, 0); + if (pfm_arch_bv_test_bit(3, pfm_pmu_conf->regs_all.pmcs)) + wrmsrl(MSR_K7_EVNTSEL0+3, 0); +} + +static struct pfm_pmu_config pfm_amd64_pmu_conf = { + .pmu_name = "AMD64", + .counter_width = 47, + .pmd_desc = pfm_amd64_pmd_desc, + .pmc_desc = pfm_amd64_pmc_desc, + .num_pmc_entries = PFM_AMD_NUM_PMCS, + .num_pmd_entries = PFM_AMD_NUM_PMDS, + .version = "1.2", + .pmu_info = &pfm_amd64_pmu_info +}; + +static int __init pfm_amd64_pmu_init_module(void) +{ + if (pfm_amd64_probe_pmu()) + return -ENOSYS; + return pfm_pmu_register(&pfm_amd64_pmu_conf); +} + +device_initcall(pfm_amd64_pmu_init_module); diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c new file mode 100644 index 000000000000..ce4293dcfcda --- /dev/null +++ b/arch/x86/perfmon/perfmon_intel_arch.c @@ -0,0 +1,628 @@ +/* + * This file contains the Intel architectural perfmon v1, v2, v3 + * description tables. + * + * Architectural perfmon was introduced with Intel Core Solo/Duo + * processors. + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kprobes.h> +#include <linux/perfmon_kern.h> +#include <asm/msr.h> +#include <asm/apic.h> + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; +static int pfm_intel_arch_version; + +DEFINE_PER_CPU(u64, saved_global_ctrl); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_IA_PMC_VAL (1ULL<<20) +#define PFM_IA_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR : starts at 0x0c1 & occupy a contiguous block of MSR + * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR + * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR + */ +#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 +#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0 + +/* + * layout of EAX for CPUID.0xa leaf function + */ +struct pmu_eax { + unsigned int version:8; /* architectural perfmon version */ + unsigned int num_cnt:8; /* number of generic counters */ + unsigned int cnt_width:8; /* width of generic counters */ + unsigned int ebx_length:8; /* number of architected events */ +}; + +/* + * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected + */ +struct pmu_edx { + unsigned int num_cnt:5; /* number of fixed counters */ + unsigned int cnt_width:8; /* width of fixed counters */ + unsigned int reserved:19; +}; + +static void pfm_intel_arch_acquire_pmu_percpu(void); +static void pfm_intel_arch_release_pmu_percpu(void); +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx); +static void __kprobes pfm_intel_arch_quiesce(void); + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = { + .stop_save = pfm_intel_arch_stop_save, + .has_ovfls = pfm_intel_arch_has_ovfls, + .quiesce = pfm_intel_arch_quiesce, + .acquire_pmu_percpu = pfm_intel_arch_acquire_pmu_percpu, + .release_pmu_percpu = pfm_intel_arch_release_pmu_percpu +}; + +#define PFM_IA_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_IA_PMC_VAL, \ + .rsvd_msk = PFM_IA_PMC_RSVD, \ + .no_emul64_msk = PFM_IA_NO64, \ + .hw_addr = MSR_GEN_SEL_BASE+(n) \ + } + +#define PFM_IA_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .dep_pmcs[0] = 1ULL << n \ + } + +#define PFM_IA_FD(n) \ + { .type = PFM_REG_C, \ + .desc = "FIXED_CTR"#n, \ + .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ + .dep_pmcs[0] = 1ULL << 16 \ + } + + +static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = { +/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3), +/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7), +/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11), +/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15), + +/* pmc16 */ { .type = PFM_REG_I, + .desc = "FIXED_CTRL", + .dfl_val = 0x8888888888888888ULL, /* force PMI */ + .rsvd_msk = 0, /* set dynamically */ + .no_emul64_msk = 0, 
+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +}; +#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) + +static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = { +/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3), +/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7), +/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11), +/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15), + +/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3), +/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7), +/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11), +/* pmd28 */ PFM_IA_FD(16), PFM_IA_FD(17), PFM_IA_FD(18), PFM_IA_FD(19) +}; +#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc) + +#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */ +#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */ +#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */ + +static struct pfm_pmu_config pfm_intel_arch_pmu_conf; + +static void pfm_intel_arch_check_errata(void) +{ + /* + * Core Duo errata AE49 (no fix). Both counters share a single + * enable bit in PERFEVTSEL0 + */ + if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING; +} + +static inline void set_enable_mask(unsigned int i) +{ + pfm_arch_bv_set_bit(i, enable_mask); + + /* max_enable = highest + 1 */ + if ((i+1) > max_enable) + max_enable = i+ 1; +} + +static void pfm_intel_arch_setup_generic(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd; + unsigned int i; + + /* + * first we handle the generic counters: + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + + /* + * min of number of Hw counters and hardcoded in the tables + */ + if (count >= PFM_IA_MAX_CNT) { + printk(KERN_INFO "perfmon: Limiting number of generic counters" + " to %u, HW supports %u", + PFM_IA_MAX_CNT, count); + count = PFM_IA_MAX_CNT; + } + + /* + * adjust rsvd_msk for generic counters based on actual width + * initialize enable_mask (1 per pmd) + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) { + pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd; + set_enable_mask(i); + } + + /* + * handle version 3 new anythread bit (21) + */ + if (version == 3) { + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21); + } + + + /* + * mark unused generic counters as not available + */ + for (i = count ; i < PFM_IA_MAX_CNT; i++) { + pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA; + pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA; + } +} + +static void pfm_intel_arch_setup_fixed(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd, dfl; + unsigned int i; + + /* + * handle the fixed counters (if any): + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + if (count >= PFM_IA_MAX_FCNT) { + printk(KERN_INFO "perfmon: Limiting number of fixed counters" + " to %u, HW supports %u", + PFM_IA_MAX_FCNT, count); + count = PFM_IA_MAX_FCNT; + } + /* + * adjust rsvd_msk for fixed counters based on 
actual width + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd; + + /* + * handle version new anythread bit (bit 2) + */ + if (version == 3) + rsvd = 1ULL << 3; + else + rsvd = 3ULL << 2; + + pfm_intel_arch_pmc_desc[16].rsvd_msk = 0; + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2); + + /* + * mark unused fixed counters as unimplemented + * + * update the rsvd_msk, dfl_val in FIXED_CTRL: + * - rsvd_msk: set all 4 bits + * - dfl_val : clear all 4 bits + */ + dfl = pfm_intel_arch_pmc_desc[16].dfl_val; + rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk; + + for (i = count ; i < PFM_IA_MAX_FCNT; i++) { + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA; + rsvd |= 0xfULL << (i<<2); + dfl &= ~(0xfULL << (i<<2)); + } + + /* + * FIXED_CTR_CTRL unavailable when no fixed counters are defined + */ + if (!count) { + pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA; + } else { + /* update rsvd_mask and dfl_val */ + pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd; + pfm_intel_arch_pmc_desc[16].dfl_val = dfl; + set_enable_mask(16); + } +} + +static int pfm_intel_arch_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + struct pmu_edx edx; + } eax, edx; + unsigned int ebx, ecx; + unsigned int width = 0; + + edx.val = 0; + + if (!cpu_has_arch_perfmon) { + PFM_INFO("no support for Intel architectural PMU"); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic option"); + return -1; + } + + /* cpuid() call protected by cpu_has_arch_perfmon */ + cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val); + + /* + * some 6/15 models have buggy BIOS + */ + if (eax.eax.version == 0 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { + PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters"); + eax.eax.version = 2; + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 40; + } + + /* + * some v2 BIOSes are incomplete + */ + if (eax.eax.version == 2 && !edx.edx.num_cnt) { + PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + edx.edx.cnt_width = 40; + } + + /* + * no fixed counters on earlier versions + */ + if (eax.eax.version < 2) { + edx.val = 0; + } else { + /* + * use the min value of both widths until we support + * variable width counters + */ + width = eax.eax.cnt_width < edx.edx.cnt_width ? + eax.eax.cnt_width : edx.edx.cnt_width; + } + + /* + * Intel Atom processors have a buggy firmware which does not report + * the correct number of fixed counters + */ + if (eax.eax.version == 3 && edx.edx.num_cnt < 3 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) { + PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + } + + PFM_INFO("detected architecural perfmon v%d", eax.eax.version); + PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d", + eax.eax.num_cnt, + eax.eax.cnt_width, + edx.edx.num_cnt, + edx.edx.cnt_width); + + pfm_intel_arch_setup_generic(eax.eax.version, + width, + eax.eax.num_cnt); + + pfm_intel_arch_setup_fixed(eax.eax.version, + width, + edx.edx.num_cnt); + + pfm_intel_arch_check_errata(); + + pfm_intel_arch_version = eax.eax.version; + + return 0; +} + +/** + * pfm_intel_arch_has_ovfls - check for pending overflow condition + * @ctx: context to work on + * + * detect if counters have overflowed. 
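+ *
+ * With the 31-bit effective width used by this module (wmask =
+ * 1ULL << 31), a counter armed with bit 31 set under perfmon's arming
+ * convention reads back with bit 31 clear once it wraps, which is
+ * exactly what the (val & wmask) test below detects.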
+ * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx) +{ + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * we can leverage the fact that we know the mapping + * to hardcode the MSR address and avoid accessing + * more cachelines + * + * We need to check cnt_mask because not all registers + * may be available. + */ + for (i = 0; num; i++) { + if (pfm_arch_bv_test_bit(i, cnt_mask)) { + rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 used_mask[PFM_PMC_BV]; + u64 val, wmask, ovfl_mask; + u32 i, count; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + pfm_arch_bv_and(used_mask, + set->used_pmcs, + enable_mask, + max_enable); + + count = pfm_arch_bv_weight(used_mask, max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, used_mask)) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. + * + * all pmds are counters + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (pfm_arch_bv_test_bit(i, set->used_pmds)) { + val = pfm_arch_read_pmd(ctx, i); + if (!(val & wmask)) { + pfm_arch_bv_set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + val = (set->pmds[i] & ~ovfl_mask) + | (val & ovfl_mask); + set->pmds[i] = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_intel_arch_quiesce(void) +{ + u16 i; + + /* + * PMC16 is the fixed control register so it has a + * distinct MSR address + * + * We do not use the hw_addr field in the table to avoid touching + * too many cachelines + */ + for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { + if (pfm_arch_bv_test_bit(i, pfm_pmu_conf->regs_all.pmcs)) { + if (i == 16) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + else + wrmsrl(MSR_P6_EVNTSEL0+i, 0); + } + } +} +/** +* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU +* +* Since v2, there exists global control MSR, to start/stop and +* also collect overflow status information. In particular, +* GLOBAL_CTRL controls start/stop and has one bit per counter. +* To maintain backward compatibility with v1, the power-on value +* of GLOBAL_CTRL should be such that generic counters are enabled +* but fixed counters are disabled (true on Penryn and Atom currently). +* +* Here, we simply make sure that all available counters are enabled. +* After that, start/stop is controlled on a per-counter basis. 
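+*
+* Example: with two generic counters and three fixed counters available,
+* the mask built below is (1ULL << 0) | (1ULL << 1) | (7ULL << 32),
+* i.e. 0x700000003, enabling every counter perfmon may use.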
+*/ +static void pfm_intel_arch_acquire_pmu_percpu(void) +{ + struct pfm_regmap_desc *d; + u64 mask = 0; + unsigned int i; + + /* nothing to do for v1 */ + if (pfm_intel_arch_version < 2) + return; + + /* + * build bitmask of registers that are available to + * us. In some cases, there may be fewer registers than + * what the PMU supports due to sharing with other kernel + * subsystems, such as NMI + */ + d = pfm_pmu_conf->pmd_desc; + for (i=0; i < 16; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << i; + } + for (i=16; i < PFM_IA_MAX_PMDS; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << (32+i-16); + } + /* + * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); + + PFM_DBG("global=0x%llx set to 0x%llx", + __get_cpu_var(saved_global_ctrl), + mask); + /* + * enable all registers + * + * No need to quiesce PMU. If there is a overflow, it will be + * treated as spurious by the handler + */ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask); +} + +/** +* pfm_intel_arch_release_pmu_percpu - release PMU resource per CPU +* +* Since v2, there exists global control MSR, to start/stop and +* also collect overflow status information. In particular, +* GLOBAL_CTRL controls start/stop and has one bit per counter. +* To maintain backward compatibility with v1, the power-on value +* of GLOBAL_CTRL should be such that generic counters are enabled +* but fixed counters are disabled (true on Penryn and Atom currently). +* +* Here, we are done using the PMU. so we restore the power-on value. +*/ +static void pfm_intel_arch_release_pmu_percpu(void) +{ + /* nothing to do for v1 */ + if (pfm_intel_arch_version < 2) + return; + + PFM_DBG("global_ctrl restored to 0x%llx\n", + __get_cpu_var(saved_global_ctrl)); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to due to the specification + * of wrmsr. bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must + * not be set (see rsvd_msk for PMDs). As such the effective width of a + * counter is 31 bits only regardless of what CPUID.0xa returns. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18 + */ +static struct pfm_pmu_config pfm_intel_arch_pmu_conf = { + .pmu_name = "Intel architectural", + .pmd_desc = pfm_intel_arch_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_IA_MAX_PMCS, + .num_pmd_entries = PFM_IA_MAX_PMDS, + .pmc_desc = pfm_intel_arch_pmc_desc, + .version = "1.0", + .pmu_info = &pfm_intel_arch_pmu_info +}; + +static int __init pfm_intel_arch_pmu_init_module(void) +{ + if (pfm_intel_arch_probe_pmu()) + return -ENOSYS; + + return pfm_pmu_register(&pfm_intel_arch_pmu_conf); +} + +device_initcall(pfm_intel_arch_pmu_init_module); diff --git a/include/linux/perfmon.h b/include/linux/perfmon.h new file mode 100644 index 000000000000..6117e605a43b --- /dev/null +++ b/include/linux/perfmon.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __LINUX_PERFMON_H__ +#define __LINUX_PERFMON_H__ + +/* + * This file contains all the user visible generic definitions for the + * interface. Model-specific user-visible definitions are located in + * the asm/perfmon.h file. + */ + +/* + * include arch-specific user interface definitions + */ +#include <asm/perfmon.h> + +/* + * defined by each arch + */ +#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS +#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS + +/* + * number of elements for each type of bitvector + * all bitvectors use u64 fixed size type on all architectures. + */ +#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3)) +#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS) +#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS) + +/* + * argument to pfm_create + * populated on return + */ +struct pfarg_sinfo { + __u64 sif_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */ + __u64 sif_avail_pmds[PFM_PMD_BV];/* out: available PMDs */ + __u64 sif_reserved1[4]; /* for future use */ +}; + +/* + * PMC and PMD generic register description + */ +struct pfarg_pmr { + __u16 reg_num; /* which register */ + __u16 reg_res1; /* reserved */ + __u32 reg_flags; /* REGFL flags */ + __u64 reg_value; /* 64-bit value */ +}; + +/* + * pfm_write, pfm_read type: + */ +#define PFM_RW_PMD 0x01 /* accessing PMD registers */ +#define PFM_RW_PMC 0x02 /* accessing PMC registers */ + +/* + * pfm_set_state state: + */ +#define PFM_ST_START 0x01 /* start monitoring */ +#define PFM_ST_STOP 0x02 /* stop monitoring */ + +/* + * pfm_attach special target to trigger detach + */ +#define PFM_NO_TARGET -1 /* detach session target */ + +/* + * default value for the user and group security parameters in + * /proc/sys/kernel/perfmon/sys_group + * /proc/sys/kernel/perfmon/task_group + */ +#define PFM_GROUP_PERM_ANY -1 /* any user/group */ + +/* + * perfmon version number + */ +#define PFM_VERSION_MAJ 3U +#define PFM_VERSION_MIN 0U +#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\ + (PFM_VERSION_MIN & 0xffff)) +#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) +#define PFM_VERSION_MINOR(x) ((x) & 0xffff) + +#endif /* __LINUX_PERFMON_H__ */ diff --git a/include/linux/perfmon_kern.h b/include/linux/perfmon_kern.h new file mode 100644 index 000000000000..e21cd835bd2c --- /dev/null +++ b/include/linux/perfmon_kern.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __LINUX_PERFMON_KERN_H__
+#define __LINUX_PERFMON_KERN_H__
+/*
+ * This file contains all the definitions of data structures, variables,
+ * and macros that are to be shared between generic code and arch-specific
+ * code.
+ *
+ * For generic-only definitions, use perfmon/perfmon_priv.h
+ */
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/perfmon.h>
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * system administrator configuration controls available via
+ * the /sys/kernel/perfmon interface
+ */
+struct pfm_controls {
+	u32	debug;		/* debugging control bitmask */
+	gid_t	task_group;	/* gid to create a per-task context */
+	size_t	arg_mem_max;	/* maximum vector argument size */
+};
+extern struct pfm_controls pfm_controls;
+
+/*
+ * event_set: encapsulates the full PMU state
+ */
+struct pfm_event_set {
+	u16 nused_pmds;			/* max number of used PMDs */
+	u16 nused_pmcs;			/* max number of used PMCs */
+	u32 priv_flags;			/* private flags (see below) */
+	u32 npend_ovfls;		/* number of pending PMD overflows */
+	u32 pad1;			/* padding */
+	u64 used_pmds[PFM_PMD_BV];	/* used PMDs */
+	u64 povfl_pmds[PFM_PMD_BV];	/* pending overflowed PMDs */
+	u64 used_pmcs[PFM_PMC_BV];	/* used PMCs */
+	u64 pmcs[PFM_MAX_PMCS];		/* PMC values */
+	u64 pmds[PFM_MAX_PMDS];		/* PMD values */
+};
+
+/*
+ * common private event set flags (priv_flags)
+ *
+ * upper 16 bits: for arch-specific use
+ * lower 16 bits: for common use
+ */
+#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */
+#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */
+#define PFM_SETFL_PRIV_MOD_BOTH	(PFM_SETFL_PRIV_MOD_PMDS \
+				| PFM_SETFL_PRIV_MOD_PMCS)
+
+
+/*
+ * context flags
+ */
+struct pfm_context_flags {
+	unsigned int started:1;		/* pfm_start() issued */
+	unsigned int is_self:1;		/* per-thread and self-monitoring */
+	unsigned int work_type:2;	/* type of work for pfm_handle_work */
+	unsigned int reserved:28;	/* for future use */
+};
+/*
+ * values for work_type (TIF_PERFMON_WORK must be set)
+ */
+#define PFM_WORK_NONE	0	/* nothing to do */
+#define PFM_WORK_ZOMBIE	1	/* cleanup zombie context */
+
+
+/*
+ * perfmon context state
+ */
+#define PFM_CTX_UNLOADED	1 /* context is detached */
+#define PFM_CTX_LOADED		2 /* context is attached */
+#define PFM_CTX_ZOMBIE		3 /* context lost owner but still attached */
+
+/*
+ * registers description
+ */
+struct pfm_regdesc {
+	u64 pmcs[PFM_PMC_BV];		/* available PMC */
+	u64 pmds[PFM_PMD_BV];		/* available PMD */
+	u64 rw_pmds[PFM_PMD_BV];	/* available RW PMD */
+	u64 intr_pmds[PFM_PMD_BV];	/* PMD generating intr */
+	u64 cnt_pmds[PFM_PMD_BV];	/* PMD counters */
+	u16 max_pmc;			/* highest+1 avail PMC */
+	u16 max_pmd;			/* highest+1 avail PMD */
+	u16 max_rw_pmd;			/* highest+1 avail RW PMD */
+	u16 first_intr_pmd;		/* first intr PMD */
+	u16 max_intr_pmd;		/* highest+1 intr PMD */
+	u16 num_rw_pmd;			/* number of avail RW PMD */
+	u16 num_pmcs;			/* number of logical PMCs */
+	u16 num_pmds;			/* number of logical PMDs */
+	u16 num_counters;		/* number of counting PMDs */
+};
+
+
+/*
+ * context: contains all the state of a session
+ */
+struct pfm_context {
+	spinlock_t		lock;	/* context protection */
+
+	struct pfm_context_flags flags;
+	u32			state;	/* current state */
+	struct task_struct	*task;	/* attached task */
+
+	u64			last_act;	/* last activation */
+	u32			last_cpu;
/* last CPU used (SMP only) */ + + struct pfm_event_set *active_set; /* active set */ + struct pfm_event_set _set0; /* event set 0 */ + + struct pfm_regdesc regs; /* registers available to context */ +}; + +/* + * logging + */ +#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x) +#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x) +#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x) +#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x) + +/* + * debugging + * + * Printk rate limiting is enforced to avoid getting flooded with too many + * error messages on the console (which could render the machine unresponsive). + * To get full debug output (turn off ratelimit): + * $ echo 0 >/proc/sys/kernel/printk_ratelimit + * + * debug is a bitmask where bits are defined as follows: + * bit 0: enable non-interrupt code degbug messages + * bit 1: enable interrupt code debug messages + */ +#ifdef CONFIG_PERFMON_DEBUG +#define _PFM_DBG(lm, f, x...) \ + do { \ + if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __func__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + } \ + } while (0) + +#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x) +#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ##x) +#else +#define PFM_DBG(f, x...) do {} while (0) +#define PFM_DBG_ovfl(f, x...) do {} while (0) +#endif + +extern struct pfm_pmu_config *pfm_pmu_conf; +extern int perfmon_disabled; + +static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c) +{ + return (struct pfm_arch_context *)(c+1); +} + +#include <linux/perfmon_pmu.h> + +extern const struct file_operations pfm_file_ops; + +void pfm_handle_work(struct pt_regs *regs); +void __pfm_exit_thread(void); +void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next); +void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next); +void __pfm_init_percpu(void *dummy); + +void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs); + +int pfm_session_allcpus_acquire(void); +void pfm_session_allcpus_release(void); + +static inline void pfm_exit_thread(void) +{ + if (current->pfm_context) + __pfm_exit_thread(); +} + +/* + * include arch-specific kernel level definitions + */ +#include <asm/perfmon_kern.h> + +static inline void pfm_copy_thread(struct task_struct *task) +{ + /* + * context or perfmon TIF state is NEVER inherited + * in child task. Holds for per-thread and system-wide + */ + task->pfm_context = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); +} + +/* + * read a single PMD register. + */ +static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + return pfm_arch_read_pmd(ctx, cnum); +} +/* + * write a single PMD register. 
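+ *
+ * Example: for an AMD64 counter with rsvd_msk = ~((1ULL << 48) - 1),
+ * a requested value of ~0ULL is silently truncated to 0xffffffffffff
+ * before being handed to pfm_arch_write_pmd().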
+ */ +static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, + u64 value) +{ + /* + * PMD writes are ignored for read-only registers + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO) + return; + + /* + * clear unimplemented bits + */ + value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; + + pfm_arch_write_pmd(ctx, cnum, value); +} + +DECLARE_PER_CPU(struct pfm_context *, pmu_ctx); +DECLARE_PER_CPU(struct task_struct *, pmu_owner); + +/* + * number of u64 to use for stack buffer in + * syscalls which take vector argument + */ +#ifndef PFM_ARCH_STK_ARG +#define PFM_ARCH_STK_ARG 2 +#endif + +#define PFM_STK_ARG PFM_ARCH_STK_ARG + +#else /* !CONFIG_PERFMON */ +/* + * perfmon hooks are nops when CONFIG_PERFMON is undefined + */ + +static inline void pfm_exit_thread(void) +{} + +static inline void pfm_handle_work(struct pt_regs *regs) +{} + +static inline void pfm_copy_thread(struct task_struct *t) +{} + +static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n) +{} + +static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n) +{} + +static inline void pfm_session_allcpus_release(void) +{} + +static inline int pfm_session_allcpus_acquire(void) +{ + return 0; +} +#endif /* CONFIG_PERFMON */ +#endif /* __LINUX_PERFMON_KERN_H__ */ diff --git a/include/linux/perfmon_pmu.h b/include/linux/perfmon_pmu.h new file mode 100644 index 000000000000..13d357140243 --- /dev/null +++ b/include/linux/perfmon_pmu.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * + * Interface for PMU description modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_PMU_H__ +#define __PERFMON_PMU_H__ 1 + +/* + * generic information about a PMC or PMD register + */ +struct pfm_regmap_desc { + u16 type; /* register infos */ + u16 reserved1; /* for future use */ + u32 reserved2; /* for future use */ + u64 dfl_val; /* power-on default value (quiescent) */ + u64 rsvd_msk; /* reserved bits: 1 means reserved */ + u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */ + unsigned long hw_addr; /* HW register address or index */ + struct kobject kobj; /* for internal use only */ + char *desc; /* HW register description string */ + u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */ +}; + +/* + * pfm_reg_desc helper macros + */ +#define PMC_D(t, d, v, r, n, h) \ + { .type = t, \ + .desc = d, \ + .dfl_val = v, \ + .rsvd_msk = r, \ + .no_emul64_msk = n, \ + .hw_addr = h \ + } + +#define PMD_D(t, d, h) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMD_DR(t, d, h, r) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = r, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMX_NA \ + { .type = PFM_REG_NA } + +/* + * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type + */ +#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */ +#define PFM_REG_I 0x01 /* PMC/PMD: implemented */ +#define PFM_REG_WC 0x02 /* PMC: has write_checker */ +#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */ +#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */ +#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */ +#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */ + +/* + * define some shortcuts for common types + */ +#define PFM_REG_W (PFM_REG_WC|PFM_REG_I) +#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I) +#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO) + +typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmr *req); + +typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmr *req); + +/* + * structure used by pmu description modules + * + * probe_pmu() routine return value: + * - 1 means recognized PMU + * - 0 means not recognized PMU + */ +struct pfm_pmu_config { + char *pmu_name; /* PMU family name */ + char *version; /* config module version */ + + int counter_width; /* width of hardware counter */ + + struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */ + struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */ + + pfm_pmc_check_t pmc_write_check;/* write checker (optional) */ + pfm_pmd_check_t pmd_write_check;/* write checker (optional) */ + pfm_pmd_check_t pmd_read_check; /* read checker (optional) */ + + u16 num_pmc_entries;/* #entries in pmc_desc */ + u16 num_pmd_entries;/* #entries in pmd_desc */ + void *pmu_info; /* model-specific infos */ + /* + * fields computed internally, do not set in module + */ + struct pfm_regdesc regs_all; /* regs available to all */ + u64 ovfl_mask; /* overflow mask */ +}; + +static inline void *pfm_pmu_info(void) +{ + return pfm_pmu_conf->pmu_info; +} + +int pfm_pmu_register(struct pfm_pmu_config *cfg); + +int pfm_sysfs_add_pmu(struct 
pfm_pmu_config *pmu); + +#endif /* __PERFMON_PMU_H__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 755b5705cd38..8e23536e66be 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1306,6 +1306,10 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; + +#if defined(CONFIG_PERFMON_V20) || defined(CONFIG_PERFMON) + struct pfm_context *pfm_context; +#endif }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d6ff145919ca..d12a175e0f43 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -625,4 +625,15 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +#ifdef CONFIG_PERFMON_V20 +struct pfarg_sinfo; +asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo *s, + char __user *f, void __user *uarg, size_t uarg_size); + +asmlinkage long sys_pfm_write(int fd, int flags, int type, void __user *arg, size_t s); +asmlinkage long sys_pfm_read(int fd, int flags, int type, void __user *arg, size_t s); +asmlinkage long sys_pfm_attach(int fd, int flags, int target); +asmlinkage long sys_pfm_set_state(int fd, int flags, int state); +#endif /* CONFIG_PERFMON_V20 */ + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a77b27b11b04..1432b300e1ca 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,3 +174,10 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); + +/* perfmon */ +cond_syscall(sys_pfm_create); +cond_syscall(sys_pfm_write); +cond_syscall(sys_pfm_read); +cond_syscall(sys_pfm_attach); +cond_syscall(sys_pfm_set_state); diff --git a/perfmon/Makefile b/perfmon/Makefile new file mode 100644 index 000000000000..4ee61aa50675 --- /dev/null +++ b/perfmon/Makefile @@ -0,0 +1,10 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian <eranian@gmail.com> +# +obj-y = perfmon_ctx.o perfmon_file.o \ + perfmon_attach.o perfmon_res.o \ + perfmon_init.o perfmon_activate.o \ + perfmon_intr.o perfmon_rw.o \ + perfmon_ctxsw.o perfmon_pmu.o \ + perfmon_syscalls.o perfmon_sysfs.o diff --git a/perfmon/perfmon_activate.c b/perfmon/perfmon_activate.c new file mode 100644 index 000000000000..9398e7c15215 --- /dev/null +++ b/perfmon/perfmon_activate.c @@ -0,0 +1,136 @@ +/* + * perfmon_activate.c: perfmon2 start/stop functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * __pfm_start - activate monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and not self-monitoring, the monitored
+ * thread must be stopped. Activation will be effective the next time the
+ * thread is context switched in.
+ *
+ * Monitoring resumes on the last active event set; on the first
+ * activation this is set0.
+ *
+ * On some architectures, e.g., IA-64, it may be possible to start monitoring
+ * without calling this function under certain conditions (per-thread and self
+ * monitoring). In this case, either set0 or the last active set is used.
+ *
+ * the context is locked and interrupts are disabled.
+ */
+int __pfm_start(struct pfm_context *ctx)
+{
+	struct task_struct *task;
+	struct pfm_event_set *set;
+
+	task = ctx->task;
+
+	/*
+	 * UNLOADED: error
+	 * LOADED  : normal start, nop if started
+	 * ZOMBIE  : cannot happen
+	 */
+	if (ctx->state == PFM_CTX_UNLOADED)
+		return -EINVAL;
+
+	set = ctx->active_set;
+
+	/*
+	 * mark as started
+	 * must be done before calling pfm_arch_start()
+	 */
+	ctx->flags.started = 1;
+
+	pfm_arch_start(task, ctx);
+
+	/*
+	 * we check whether we had a pending ovfl before restarting.
+	 * If so we need to regenerate the interrupt to make sure we
+	 * keep recorded samples. For non-self monitoring this check
+	 * is done in the pfm_ctxswin_thread() routine.
+	 *
+	 * only the active set needs to be checked here, because
+	 * pfm_switch_sets() already takes care of replaying the
+	 * pending interrupts when sets are switched
+	 */
+	if (task == current && set->npend_ovfls)
+		pfm_arch_resend_irq(ctx);
+
+	return 0;
+}
+
+/**
+ * __pfm_stop - stop monitoring
+ * @ctx: context to operate on
+ *
+ * When operating in per-thread mode and when not self-monitoring,
+ * the monitored thread must be stopped.
+ *
+ * the context is locked and interrupts are disabled.
+ */
+int __pfm_stop(struct pfm_context *ctx)
+{
+	struct task_struct *task;
+
+	/*
+	 * context must be attached (zombie cannot happen)
+	 */
+	if (ctx->state == PFM_CTX_UNLOADED)
+		return -EINVAL;
+
+	task = ctx->task;
+
+	PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d",
+		task ? task->pid : -1,
+		ctx->state,
+		!task);
+
+	pfm_arch_stop(task, ctx);
+
+	ctx->flags.started = 0;
+	/*
+	 * starting now, in-flight PMU interrupts for this context
+	 * are treated as spurious
+	 */
+	return 0;
+}
diff --git a/perfmon/perfmon_attach.c b/perfmon/perfmon_attach.c
new file mode 100644
index 000000000000..4ef00982f218
--- /dev/null
+++ b/perfmon/perfmon_attach.c
@@ -0,0 +1,337 @@
+/*
+ * perfmon_attach.c: perfmon2 load/unload functions
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/perfmon_kern.h> +#include "perfmon_priv.h" + +/** + * __pfm_load_ctx_thread - attach context to a thread + * @ctx: context to operate on + * @task: thread to attach to + * + * The function must be called with the context locked and interrupts disabled. + */ +static int pfm_load_ctx_thread(struct pfm_context *ctx, + struct task_struct *task) +{ + struct pfm_event_set *set; + struct pfm_context *old; + int ret; + u16 max; + + PFM_DBG("pid=%d", task->pid); + + /* + * we must use cmpxchg to avoid race condition with another + * context trying to attach to the same task. + * + * per-thread: + * - task to attach to is checked in sys_pfm_load_context() to avoid + * locking issues. if found, and not self, task refcount was + * incremented. 
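+ *
+ * Illustrative (hypothetical) timeline of the race the cmpxchg below
+ * closes, with two contexts A and B racing to attach to task T:
+ *
+ *   CPU0: cmpxchg(&T->pfm_context, NULL, A) -> returns NULL, A wins
+ *   CPU1: cmpxchg(&T->pfm_context, NULL, B) -> returns A, B gets -EEXIST
+ *
+ * exactly one attach can succeed, without holding the task's lock.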
+ */ + old = cmpxchg(&task->pfm_context, NULL, ctx); + if (old) { + PFM_DBG("load_pid=%d has a context " + "old=%p new=%p cur=%p", + task->pid, + old, + ctx, + task->pfm_context); + return -EEXIST; + } + + /* + * initialize sets + */ + set = ctx->active_set; + + /* + * cleanup bitvectors + */ + max = ctx->regs.max_intr_pmd; + pfm_arch_bv_zero(set->povfl_pmds, max); + + set->npend_ovfls = 0; + + /* + * we cannot just use plain clear because of arch-specific flags + */ + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + /* + * link context to task + */ + ctx->task = task; + + /* + * perform any architecture specific actions + */ + ret = pfm_arch_load_context(ctx); + if (ret) + goto error_noload; + + /* + * now reserve the session, before we can proceed with + * actually accessing the PMU hardware + */ + ret = pfm_session_acquire(); + if (ret) + goto error; + + + if (ctx->task != current) { + + /* not self-monitoring */ + ctx->flags.is_self = 0; + + /* force a full reload */ + ctx->last_act = PFM_INVALID_ACTIVATION; + ctx->last_cpu = -1; + set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + + } else { + /* + * on UP, we may have to push out the PMU + * state of the last monitored thread + */ + pfm_check_save_prev_ctx(); + + ctx->last_cpu = smp_processor_id(); + __get_cpu_var(pmu_activation_number)++; + ctx->last_act = __get_cpu_var(pmu_activation_number); + + ctx->flags.is_self = 1; + + /* + * load PMD from set + * load PMC from set + */ + pfm_arch_restore_pmds(ctx, set); + pfm_arch_restore_pmcs(ctx, set); + + /* + * set new ownership + */ + pfm_set_pmu_owner(ctx->task, ctx); + } + + /* + * will cause switch_to() to invoke PMU + * context switch code + */ + set_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + + ctx->state = PFM_CTX_LOADED; + + return 0; + +error: + pfm_arch_unload_context(ctx); + ctx->task = NULL; +error_noload: + /* + * detach context + */ + task->pfm_context = NULL; + return ret; +} + +/** + * __pfm_load_context - attach context to a thread + * @ctx: context to operate on + * @task: thread to attach to + */ +int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task) +{ + return pfm_load_ctx_thread(ctx, task); +} + +/** + * pfm_update_ovfl_pmds - account for pending ovfls on PMDs + * @ctx: context to operate on + * + * This function is always called after pfm_stop has been issued + */ +static void pfm_update_ovfl_pmds(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + u64 *cnt_pmds; + u64 ovfl_mask; + u16 num_ovfls, i; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + set = ctx->active_set; + + if (!set->npend_ovfls) + return; + + num_ovfls = set->npend_ovfls; + PFM_DBG("novfls=%u", num_ovfls); + + for (i = 0; num_ovfls; i++) { + if (pfm_arch_bv_test_bit(i, set->povfl_pmds)) { + /* only correct value for counters */ + if (pfm_arch_bv_test_bit(i, cnt_pmds)) + set->pmds[i] += 1 + ovfl_mask; + num_ovfls--; + } + PFM_DBG("pmd%u val=0x%llx", + i, + (unsigned long long)set->pmds[i]); + } + /* + * we need to clear to prevent a pfm_getinfo_evtsets() from + * returning stale data even after the context is unloaded + */ + set->npend_ovfls = 0; + pfm_arch_bv_zero(set->povfl_pmds, ctx->regs.max_intr_pmd); +} + +/** + * __pfm_unload_context - detach context from CPU or thread + * @ctx: context to operate on + * + * The function must be called with the context locked and interrupts disabled. + */ +int __pfm_unload_context(struct pfm_context *ctx) +{ + int ret; + + PFM_DBG("ctx_state=%d task [%d]", + ctx->state, + ctx->task ? 
ctx->task->pid : -1); + + /* + * check unload-able state + */ + if (ctx->state == PFM_CTX_UNLOADED) + return -EINVAL; + + /* + * stop monitoring + */ + ret = __pfm_stop(ctx); + if (ret) + return ret; + + ctx->state = PFM_CTX_UNLOADED; + + /* + * save active set + * UP: + * if not current task and due to lazy, state may + * still be live + * for system-wide, guaranteed to run on correct CPU + */ + if (__get_cpu_var(pmu_ctx) == ctx) { + /* + * pending overflows have been saved by pfm_stop() + */ + pfm_save_pmds(ctx); + pfm_set_pmu_owner(NULL, NULL); + PFM_DBG("released ownership"); + } + + /* + * account for pending overflows + */ + pfm_update_ovfl_pmds(ctx); + + /* + * arch-specific unload operations + */ + pfm_arch_unload_context(ctx); + + /* + * per-thread: disconnect from monitored task + */ + if (ctx->task) { + ctx->task->pfm_context = NULL; + clear_tsk_thread_flag(ctx->task, TIF_PERFMON_CTXSW); + ctx->task = NULL; + } + return 0; +} + +/** + * __pfm_exit_thread - detach and free context on thread exit + */ +void __pfm_exit_thread(void) +{ + struct pfm_context *ctx; + unsigned long flags; + int free_ok = 0, ret = -1; + + ctx = current->pfm_context; + + spin_lock_irqsave(&ctx->lock, flags); + + PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self); + + /* + * __pfm_unload_context() cannot fail + * in the context states we are interested in + */ + switch (ctx->state) { + case PFM_CTX_LOADED: + ret = __pfm_unload_context(ctx); + break; + case PFM_CTX_ZOMBIE: + ret = __pfm_unload_context(ctx); + free_ok = 1; + break; + default: + BUG_ON(ctx->state != PFM_CTX_LOADED); + break; + } + spin_unlock_irqrestore(&ctx->lock, flags); + + if (!ret) + pfm_session_release(); + + /* + * All memory free operations (especially for vmalloc'ed memory) + * MUST be done with interrupts ENABLED. + */ + if (free_ok) + pfm_free_context(ctx); +} diff --git a/perfmon/perfmon_ctx.c b/perfmon/perfmon_ctx.c new file mode 100644 index 000000000000..985977069a40 --- /dev/null +++ b/perfmon/perfmon_ctx.c @@ -0,0 +1,400 @@ +/* + * perfmon_ctx.c: perfmon2 context functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/fdtable.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * context memory pool pointer
+ */
+static struct kmem_cache *pfm_ctx_cachep;
+
+/*
+ * This function is called when we need to perform asynchronous
+ * work on a context. This function is called ONLY when about to
+ * return to user mode (very much like with signal handling).
+ *
+ * we come here if:
+ *
+ * - we are a zombie and we need to clean up our state
+ *
+ * pfm_handle_work() can be called with interrupts enabled
+ * (TIF_NEED_RESCHED) or disabled.
+ */
+void pfm_handle_work(struct pt_regs *regs)
+{
+	struct pfm_context *ctx;
+	unsigned long flags;
+	int type;
+
+	if (!user_mode(regs))
+		return;
+
+	clear_thread_flag(TIF_PERFMON_WORK);
+
+	ctx = current->pfm_context;
+	if (ctx == NULL) {
+		PFM_DBG("[%d] has no ctx", current->pid);
+		return;
+	}
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	type = ctx->flags.work_type;
+	ctx->flags.work_type = PFM_WORK_NONE;
+
+	PFM_DBG("work_type=%d", type);
+
+	switch (type) {
+	case PFM_WORK_ZOMBIE:
+		goto do_zombie;
+	default:
+		PFM_DBG("unknown type=%d", type);
+		goto nothing_todo;
+	}
+nothing_todo:
+	/*
+	 * restore flags as they were upon entry
+	 */
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return;
+
+do_zombie:
+	PFM_DBG("context is zombie, bailing out");
+
+	/* always returns 0 in this case */
+	__pfm_unload_context(ctx);
+
+	/*
+	 * keep the spinlock check happy
+	 */
+	spin_unlock(&ctx->lock);
+
+	/*
+	 * enable interrupts for vfree()
+	 */
+	local_irq_enable();
+
+	/*
+	 * actual context free
+	 */
+	pfm_free_context(ctx);
+
+	/*
+	 * restore interrupts as they were upon entry
+	 */
+	local_irq_restore(flags);
+
+	/*
+	 * pfm_unload always succeeds here, so we can
+	 * release the session safely
+	 */
+	pfm_session_release();
+}
+
+/**
+ * pfm_free_context - de-allocate context and associated resources
+ * @ctx: context to free
+ */
+void pfm_free_context(struct pfm_context *ctx)
+{
+	pfm_arch_context_free(ctx);
+
+	PFM_DBG("free ctx @0x%p", ctx);
+	kmem_cache_free(pfm_ctx_cachep, ctx);
+	/*
+	 * decrease refcount on:
+	 *   - PMU description table
+	 */
+	pfm_pmu_release();
+}
+
+/**
+ * pfm_init_ctx -- initialize context SLAB
+ *
+ * called from pfm_init
+ */
+int __init pfm_init_ctx(void)
+{
+	pfm_ctx_cachep = kmem_cache_create("pfm_context",
+				sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
+				0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!pfm_ctx_cachep) {
+		PFM_ERR("cannot initialize context slab");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ * pfm_ctx_permissions - check authorization to create new context
+ * @ctx_flags: context flags passed by user
+ *
+ * check for permissions to create a context.
+ *
+ * A sysadmin may decide to restrict creation of per-thread
+ * contexts to a group of users using the group id via
+ * /sys/kernel/perfmon/task_group
+ *
+ * Once we identify a user level package which can be used
+ * to grant/revoke Linux capabilities at login via PAM, we will
+ * be able to use capabilities. We would also need to increase
+ * the size of cap_t to support more than 32 capabilities (it
+ * is currently defined as u32 and 32 capabilities are already
+ * defined).
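+ *
+ * For example (illustrative), a sysadmin can restrict per-thread
+ * context creation to members of group 500 with:
+ *
+ *   # echo 500 > /sys/kernel/perfmon/task_group
+ *
+ * after which this check returns -EPERM for any caller that is
+ * not a member of group 500.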
+ */
+static inline int pfm_ctx_permissions(u32 ctx_flags)
+{
+	if (pfm_controls.task_group != PFM_GROUP_PERM_ANY
+	    && !in_group_p(pfm_controls.task_group)) {
+		PFM_DBG("user group not allowed to create a task context");
+		return -EPERM;
+	}
+	return 0;
+}
+
+/**
+ * pfm_create_initial_set - create initial set, called from __pfm_create_context()
+ * @ctx: context to attach the set to
+ */
+static void pfm_create_initial_set(struct pfm_context *ctx)
+{
+	struct pfm_event_set *set;
+	u64 *impl_pmcs;
+	u16 i, max_pmc;
+
+	set = ctx->active_set;
+	max_pmc = ctx->regs.max_pmc;
+	impl_pmcs = ctx->regs.pmcs;
+
+	/*
+	 * install default values for all PMC registers
+	 */
+	for (i = 0; i < max_pmc; i++) {
+		if (pfm_arch_bv_test_bit(i, impl_pmcs)) {
+			set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val;
+			PFM_DBG("pmc%u=0x%llx",
+				i,
+				(unsigned long long)set->pmcs[i]);
+		}
+	}
+	/*
+	 * PMD registers are set to 0 when the event set is allocated,
+	 * hence we do not need to explicitly initialize them.
+	 *
+	 * For virtual PMD registers (i.e., those tied to a SW resource)
+	 * their value becomes meaningful once the context is attached.
+	 */
+}
+
+/**
+ * __pfm_create_context - allocate and initialize a perfmon context
+ * @ctx_flags: user context flags
+ * @sif: pointer to pfarg_sinfo to be updated
+ * @new_ctx: will contain new context address on return
+ *
+ * function used to allocate a new context. A context is allocated along
+ * with the default event set. If a sampling format is used, the buffer
+ * may be allocated and initialized.
+ *
+ * The file descriptor identifying the context is allocated and returned
+ * to the caller.
+ *
+ * This function operates with no locks and interrupts are enabled.
+ * return:
+ * 	>=0: the file descriptor to identify the context
+ * 	<0 : the error code
+ */
+int __pfm_create_context(__u32 ctx_flags,
+			 struct pfarg_sinfo *sif,
+			 struct pfm_context **new_ctx)
+{
+	struct pfm_context *ctx;
+	struct file *filp = NULL;
+	int fd = 0, ret = -EINVAL;
+
+	if (!pfm_pmu_conf)
+		return -ENOSYS;
+
+	/* no context flags supported yet */
+	if (ctx_flags)
+		goto error_alloc;
+
+	ret = pfm_ctx_permissions(ctx_flags);
+	if (ret < 0)
+		goto error_alloc;
+
+	/*
+	 * we can use GFP_KERNEL and potentially sleep because we do
+	 * not hold any lock at this point.
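+ *
+ * A minimal user-level sketch of the intended call sequence (assuming
+ * a libpfm-style wrapper around the pfm_create system call; the
+ * wrapper name and arguments are illustrative, not part of this patch):
+ *
+ *   struct pfarg_sinfo sif;
+ *   int fd;
+ *
+ *   fd = pfm_create(0, &sif);     // ends up in __pfm_create_context()
+ *   if (fd < 0)
+ *           err(1, "pfm_create");
+ *   // sif.sif_avail_pmcs/sif.sif_avail_pmds now describe the PMU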
+	 */
+	might_sleep();
+	ret = -ENOMEM;
+	ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL);
+	if (!ctx)
+		goto error_alloc;
+
+	PFM_DBG("alloc ctx @0x%p", ctx);
+
+	ctx->active_set = &ctx->_set0;
+
+	spin_lock_init(&ctx->lock);
+
+	/*
+	 * context is unloaded
+	 */
+	ctx->state = PFM_CTX_UNLOADED;
+
+
+	ret = pfm_pmu_acquire(ctx);
+	if (ret)
+		goto error_file;
+	/*
+	 * check if PMU is usable
+	 */
+	if (!(ctx->regs.num_pmcs && ctx->regs.num_pmds)) {
+		PFM_DBG("no usable PMU registers");
+		ret = -EBUSY;
+		goto error_file;
+	}
+
+	ret = -ENFILE;
+	fd = pfm_alloc_fd(&filp);
+	if (fd < 0)
+		goto error_file;
+
+	/*
+	 * initialize arch-specific section
+	 * must be done before fmt_init()
+	 */
+	ret = pfm_arch_context_create(ctx, ctx_flags);
+	if (ret)
+		goto error_set;
+
+	ret = -ENOMEM;
+
+	/*
+	 * add initial set
+	 */
+	pfm_create_initial_set(ctx);
+
+	filp->private_data = ctx;
+
+	ctx->last_act = PFM_INVALID_ACTIVATION;
+	ctx->last_cpu = -1;
+
+	PFM_DBG("flags=0x%x fd=%d", ctx_flags, fd);
+
+	if (new_ctx)
+		*new_ctx = ctx;
+
+	/*
+	 * copy bitmask of available PMU registers
+	 *
+	 * must copy over the entire vector so that bogus
+	 * upper bits passed in by the user are overwritten
+	 */
+	pfm_arch_bv_copy(sif->sif_avail_pmcs,
+			 ctx->regs.pmcs,
+			 PFM_MAX_PMCS);
+
+	pfm_arch_bv_copy(sif->sif_avail_pmds,
+			 ctx->regs.pmds,
+			 PFM_MAX_PMDS);
+
+	/*
+	 * we defer the fd_install until we are certain the call succeeded
+	 * to ensure we do not have to undo its effect. Neither put_filp()
+	 * nor put_unused_fd() undoes the effect of fd_install().
+	 */
+	fd_install(fd, filp);
+
+	return fd;
+
+error_set:
+	put_filp(filp);
+	put_unused_fd(fd);
+error_file:
+	/*
+	 * calls the right *_put() functions
+	 * calls pfm_release_pmu()
+	 */
+	pfm_free_context(ctx);
+	return ret;
+error_alloc:
+	return ret;
+}
+
+/**
+ * pfm_undo_create -- undo context creation
+ * @fd: file descriptor to close
+ * @ctx: newly created context
+ *
+ * upon return neither fd nor ctx are usable
+ */
+void pfm_undo_create(int fd, struct pfm_context *ctx)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+	int fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	/*
+	 * there is no fd_uninstall(), so we do it
+	 * here. put_unused_fd() does not remove the
+	 * effect of fd_install().
+	 */
+
+	spin_lock(&files->file_lock);
+	files->fd_array[fd] = NULL;
+	spin_unlock(&files->file_lock);
+
+	fput_light(file, fput_needed);
+
+	/*
+	 * decrement ref count and kill file
+	 */
+	put_filp(file);
+
+	put_unused_fd(fd);
+
+	pfm_free_context(ctx);
+}
diff --git a/perfmon/perfmon_ctxsw.c b/perfmon/perfmon_ctxsw.c
new file mode 100644
index 000000000000..b1086f6dca31
--- /dev/null
+++ b/perfmon/perfmon_ctxsw.c
@@ -0,0 +1,252 @@
+/*
+ * perfmon_ctxsw.c: perfmon2 context switch code
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * 	http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+void pfm_save_pmds(struct pfm_context *ctx)
+{
+	struct pfm_event_set *set;
+	u64 val, ovfl_mask;
+	u64 *used_pmds, *cnt_pmds;
+	u16 i, num;
+
+	set = ctx->active_set;
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	num = set->nused_pmds;
+	cnt_pmds = ctx->regs.cnt_pmds;
+	used_pmds = set->used_pmds;
+
+	/*
+	 * save HW PMD, for counters, reconstruct 64-bit value
+	 */
+	for (i = 0; num; i++) {
+		if (pfm_arch_bv_test_bit(i, used_pmds)) {
+			val = pfm_read_pmd(ctx, i);
+			if (likely(pfm_arch_bv_test_bit(i, cnt_pmds)))
+				val = (set->pmds[i] & ~ovfl_mask) |
+					(val & ovfl_mask);
+			set->pmds[i] = val;
+			num--;
+		}
+	}
+}
+
+/*
+ * interrupts are disabled (no preemption)
+ */
+void __pfm_ctxswin_thread(struct task_struct *task,
+			  struct pfm_context *ctx)
+{
+	u64 cur_act;
+	struct pfm_event_set *set;
+	int reload_pmcs, reload_pmds;
+	int mycpu, is_active;
+
+	mycpu = smp_processor_id();
+
+	cur_act = __get_cpu_var(pmu_activation_number);
+	/*
+	 * we need to lock the context because it could be accessed
+	 * from another CPU. Normally schedule() has masked interrupts,
+	 * which should be enough to protect against PMU interrupts.
+	 */
+	spin_lock(&ctx->lock);
+
+	is_active = pfm_arch_is_active(ctx);
+
+	set = ctx->active_set;
+
+	/*
+	 * in case of zombie, we do not complete the ctxswin of the
+	 * PMU, and we force a call to pfm_handle_work() to finish
+	 * cleanup, i.e., free context + smpl_buf. The reason for
+	 * deferring to pfm_handle_work() is that it is not possible
+	 * to vfree() with interrupts disabled.
+	 */
+	if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
+		pfm_post_work(task, ctx, PFM_WORK_ZOMBIE);
+		goto done;
+	}
+
+	/*
+	 * if we were the last user of the PMU on that CPU,
+	 * then nothing to do except restore psr
+	 */
+	if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) {
+		/*
+		 * check for forced reload conditions
+		 */
+		reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
+		reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
+	} else {
+#ifndef CONFIG_SMP
+		pfm_check_save_prev_ctx();
+#endif
+		reload_pmcs = 1;
+		reload_pmds = 1;
+	}
+	/* consumed */
+	set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
+
+	if (reload_pmds)
+		pfm_arch_restore_pmds(ctx, set);
+
+	/*
+	 * need to check if we had an in-flight interrupt at the time of
+	 * pfm_ctxswout_thread(). If at least one bit is set, then we must
+	 * replay the interrupt to avoid losing some important performance
+	 * data.
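+	 *
+	 * Illustrative UP timeline showing why the replay matters:
+	 *
+	 *   t0: monitored task T1 overflows a counter, PMU intr raised
+	 *   t1: switch to T2 before the interrupt is serviced; lazy
+	 *       save leaves npend_ovfls set in T1's active set
+	 *   t2: T1 is switched back in; pfm_arch_resend_irq()
+	 *       re-injects the interrupt so the overflow is not lost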
+ * + * npend_ovfls is cleared in interrupt handler + */ + if (set->npend_ovfls) + pfm_arch_resend_irq(ctx); + + if (reload_pmcs) + pfm_arch_restore_pmcs(ctx, set); + + /* + * record current activation for this context + */ + __get_cpu_var(pmu_activation_number)++; + ctx->last_cpu = mycpu; + ctx->last_act = __get_cpu_var(pmu_activation_number); + + /* + * establish new ownership. + */ + pfm_set_pmu_owner(task, ctx); + + pfm_arch_ctxswin_thread(task, ctx); +done: + spin_unlock(&ctx->lock); +} + +/* + * interrupts are masked, runqueue lock is held. + * + * In UP. we simply stop monitoring and leave the state + * in place, i.e., lazy save + */ +void __pfm_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx) +{ + int need_save_pmds, is_active; + + /* + * we need to lock context because it could be accessed + * from another CPU. Normally the schedule() functions + * has masked interrupts which should be enough to + * protect against PMU interrupts. + */ + + spin_lock(&ctx->lock); + + is_active = pfm_arch_is_active(ctx); + + /* + * stop monitoring and + * collect pending overflow information + * needed on ctxswin. We cannot afford to lose + * a PMU interrupt. + */ + need_save_pmds = pfm_arch_ctxswout_thread(task, ctx); + +#ifdef CONFIG_SMP + /* + * in SMP, release ownership of this PMU. + * PMU interrupts are masked, so nothing + * can happen. + */ + pfm_set_pmu_owner(NULL, NULL); + + /* + * On some architectures, it is necessary to read the + * PMD registers to check for pending overflow in + * pfm_arch_ctxswout_thread(). In that case, saving of + * the PMDs may be done there and not here. + */ + if (need_save_pmds) + pfm_save_pmds(ctx); +#endif + spin_unlock(&ctx->lock); +} + +/** + * pfm_ctxsw_out - save PMU state on context switch out + * @prev: thread being switched out + * @next: thread being switched in + * + * We pass the next thread as on some platforms it may be necessary to + * pass some settings from the current thread to the next + * + * Interrupts are masked + */ +void pfm_ctxsw_out(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctxp; + + ctxp = prev->pfm_context; + + if (ctxp) + __pfm_ctxswout_thread(prev, ctxp); +} + +/** + * pfm_ctxsw_in - restore PMU state on context switch in + * @prev: thread being switched out + * @next: thread being switched in + * + * We pass the prev thread as on some platforms it may be necessary to + * pass some settings from the current thread to the next + * + * Interrupts are masked + */ +void pfm_ctxsw_in(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctxn; + + ctxn = next->pfm_context; + + if (ctxn) + __pfm_ctxswin_thread(next, ctxn); + +} diff --git a/perfmon/perfmon_file.c b/perfmon/perfmon_file.c new file mode 100644 index 000000000000..12ec6b7bea73 --- /dev/null +++ b/perfmon/perfmon_file.c @@ -0,0 +1,306 @@ +/* + * perfmon_file.c: perfmon2 file input/output functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/vfs.h> +#include <linux/mount.h> +#include <linux/perfmon_kern.h> +#include "perfmon_priv.h" + +#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ + +struct pfm_controls pfm_controls = { + .task_group = PFM_GROUP_PERM_ANY, + .arg_mem_max = PAGE_SIZE, +}; + +static int __init enable_debug(char *str) +{ + pfm_controls.debug = 1; + PFM_INFO("debug output enabled\n"); + return 1; +} +__setup("perfmon_debug", enable_debug); + +static int pfmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); +} + +static struct file_system_type pfm_fs_type = { + .name = "pfmfs", + .get_sb = pfmfs_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * pfmfs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pfm: will go nicely and kill the special-casing in procfs. + */ +static struct vfsmount *pfmfs_mnt; + +int __init pfm_init_fs(void) +{ + int err = register_filesystem(&pfm_fs_type); + if (!err) { + pfmfs_mnt = kern_mount(&pfm_fs_type); + err = PTR_ERR(pfmfs_mnt); + if (IS_ERR(pfmfs_mnt)) + unregister_filesystem(&pfm_fs_type); + else + err = 0; + } + return err; +} + +/* + * called either on explicit close() or from exit_files(). + * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. 
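+ *
+ * Illustrative sequence with a shared descriptor:
+ *
+ *   T1: fd = pfm_create(...)  -> file refcount 1
+ *   T1: fork()                -> file refcount 2
+ *   T1: close(fd)             -> refcount 1, ->release() NOT called
+ *   child exits               -> exit_files() drops the last
+ *                                reference, pfm_close() runs once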
+ */ +static int __pfm_close(struct pfm_context *ctx, struct file *filp) +{ + unsigned long flags; + int state; + int can_free = 1, can_unload = 1; + int can_release = 0; + + spin_lock_irqsave(&ctx->lock, flags); + + state = ctx->state; + + PFM_DBG("state=%d", state); + + /* + * check if unload is needed + */ + if (state == PFM_CTX_UNLOADED) + goto doit; + +#ifdef CONFIG_SMP + if (ctx->task != current) { + /* + * switch context to zombie state + */ + ctx->state = PFM_CTX_ZOMBIE; + + PFM_DBG("zombie ctx for [%d]", ctx->task->pid); + /* + * PMU session will be released by monitored task when + * it notices ZOMBIE state as part of pfm_unload_context() + */ + can_unload = can_free = 0; + } +#endif + if (can_unload) + can_release = !__pfm_unload_context(ctx); +doit: + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_session_release(); + + if (can_free) + pfm_free_context(ctx); + + return 0; +} + +/* + * called either on explicit close() or from exit_files(). + * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. + */ +static int pfm_close(struct inode *inode, struct file *filp) +{ + struct pfm_context *ctx; + + PFM_DBG("called filp=%p", filp); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx"); + return -EBADF; + } + return __pfm_close(ctx, filp); +} + +static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) +{ + PFM_DBG("pfm_file_ops"); + + return -ENXIO; +} + +static unsigned int pfm_no_poll(struct file *filp, poll_table *wait) +{ + return 0; +} + +static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, + loff_t *ppos) +{ + PFM_DBG("pfm_read called"); + return -EINVAL; +} + +static ssize_t pfm_write(struct file *file, const char __user *ubuf, + size_t size, loff_t *ppos) +{ + PFM_DBG("pfm_write called"); + return -EINVAL; +} + +static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + PFM_DBG("pfm_ioctl called"); + return -EINVAL; +} + +const struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open to disallow open via /proc */ + .release = pfm_close, + .poll = pfm_no_poll, +}; + +static int pfmfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations pfmfs_dentry_operations = { + .d_delete = pfmfs_delete_dentry, +}; + +int pfm_alloc_fd(struct file **cfile) +{ + int fd, ret = 0; + struct file *file = NULL; + struct inode * inode; + char name[32]; + struct qstr this; + + fd = get_unused_fd(); + if (fd < 0) + return -ENFILE; + + ret = -ENFILE; + + file = get_empty_filp(); + if (!file) + goto out; + + /* + * allocate a new inode + */ + inode = new_inode(pfmfs_mnt->mnt_sb); + if (!inode) + goto out; + + PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode); + + inode->i_sb = pfmfs_mnt->mnt_sb; + inode->i_mode = S_IFCHR|S_IRUGO; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.hash = inode->i_ino; + this.len 
= strlen(name); + + ret = -ENOMEM; + + /* + * allocate a new dcache entry + */ + file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) + goto out; + + file->f_dentry->d_op = &pfmfs_dentry_operations; + + d_add(file->f_dentry, inode); + file->f_vfsmnt = mntget(pfmfs_mnt); + file->f_mapping = inode->i_mapping; + + file->f_op = &pfm_file_ops; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->f_pos = 0; + + *cfile = file; + + return fd; +out: + if (file) + put_filp(file); + put_unused_fd(fd); + return ret; +} diff --git a/perfmon/perfmon_init.c b/perfmon/perfmon_init.c new file mode 100644 index 000000000000..a92126d1687c --- /dev/null +++ b/perfmon/perfmon_init.c @@ -0,0 +1,87 @@ +/* + * perfmon.c: perfmon2 global initialization functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/perfmon_kern.h> +#include "perfmon_priv.h" + +/* + * external variables + */ +DEFINE_PER_CPU(struct task_struct *, pmu_owner); +DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); +DEFINE_PER_CPU(u64, pmu_activation_number); + +int perfmon_disabled; /* >0 if perfmon is disabled */ + +/* + * global initialization routine, executed only once + */ +int __init pfm_init(void) +{ + PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN); + + if (pfm_init_ctx()) + goto error_disable; + + if (pfm_init_fs()) + goto error_disable; + + if (pfm_init_sysfs()) + goto error_disable; + + /* + * one time, arch-specific global initialization + */ + if (pfm_arch_init()) + goto error_disable; + + return 0; + +error_disable: + PFM_ERR("perfmon is disabled due to initialization error"); + perfmon_disabled = 1; + return -1; +} + +/* + * must use subsys_initcall() to ensure that the perfmon2 core + * is initialized before any PMU description module when they are + * compiled in. 
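+ *
+ * Initcall ordering (illustrative):
+ *
+ *   subsys_initcall:  pfm_init() sets up core state, pfmfs, sysfs
+ *   device_initcall:  a built-in PMU description module calls
+ *                     pfm_pmu_register(), which expects the core
+ *                     (e.g., the sysfs tree) to be ready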
+ */
+subsys_initcall(pfm_init);
diff --git a/perfmon/perfmon_intr.c b/perfmon/perfmon_intr.c
new file mode 100644
index 000000000000..d9e87bb11aa2
--- /dev/null
+++ b/perfmon/perfmon_intr.c
@@ -0,0 +1,295 @@
+/*
+ * perfmon_intr.c: perfmon2 interrupt handling
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * 	http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation
+ * @ctx: context to operate on
+ * @set: set to operate on
+ *
+ * The function returns the number of 64-bit overflows detected.
+ *
+ * 64-bit software pmds are updated for overflowed pmd registers
+ *
+ * In any case, set->npend_ovfls is cleared
+ */
+static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx,
+					struct pfm_event_set *set)
+{
+	u16 i, num_ovfls, max_pmd, max_intr;
+	u16 num_64b_ovfls;
+	u64 old_val, new_val, ovfl_mask;
+
+	num_64b_ovfls = 0;
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	max_pmd = ctx->regs.max_pmd;
+	max_intr = ctx->regs.max_intr_pmd;
+
+	num_ovfls = set->npend_ovfls;
+
+	for (i = 0; num_ovfls; i++) {
+		/*
+		 * skip pmds which did not overflow
+		 */
+		if (!pfm_arch_bv_test_bit(i, set->povfl_pmds))
+			continue;
+
+		num_ovfls--;
+
+		/*
+		 * Update software value for counters ONLY
+		 *
+		 * Note that the pmd is not necessarily 0 at this point as
+		 * qualified events may have happened before the PMU was
+		 * frozen. The residual count is not taken into consideration
+		 * here but will be with any read of the pmd
+		 */
+		if (likely(pfm_arch_bv_test_bit(i, ctx->regs.cnt_pmds))) {
+			old_val = new_val = set->pmds[i];
+			new_val += 1 + ovfl_mask;
+			set->pmds[i] = new_val;
+		} else {
+			/*
+			 * for non counters which interrupt, e.g., AMD IBS,
+			 * we consider this equivalent to a 64-bit counter
+			 * overflow.
+			 */
+			old_val = 1; new_val = 0;
+		}
+
+		/*
+		 * check for 64-bit overflow condition
+		 */
+		if (likely(old_val > new_val)) {
+			num_64b_ovfls++;
+		} else {
+			/*
+			 * on some PMUs, it may be necessary to re-arm the PMD
+			 */
+			pfm_arch_ovfl_reset_pmd(ctx, i);
+		}
+
+		PFM_DBG_ovfl("pmd%u ovfl=%s new=0x%llx old=0x%llx "
+			     "hw_pmd=0x%llx",
+			     i,
+			     old_val > new_val ? "64-bit" : "HW",
+			     (unsigned long long)new_val,
+			     (unsigned long long)old_val,
+			     (unsigned long long)pfm_read_pmd(ctx, i));
+	}
+	/*
+	 * mark the overflows as consumed
+	 */
+	set->npend_ovfls = 0;
+	pfm_arch_bv_zero(set->povfl_pmds, max_intr);
+
+	return num_64b_ovfls;
+}
+
+/**
+ * pfm_overflow_handler - main overflow processing routine.
+ * @ctx: context to work on (always current context)
+ * @set: current event set
+ * @ip: interrupt instruction pointer
+ * @regs: machine state
+ */
+static void pfm_overflow_handler(struct pfm_context *ctx,
+				 struct pfm_event_set *set,
+				 unsigned long ip,
+				 struct pt_regs *regs)
+{
+	/*
+	 * skip ZOMBIE case
+	 */
+	if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
+		goto stop_monitoring;
+
+	PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p u_pmds=0x%llx",
+		     (unsigned long long)set->povfl_pmds[0],
+		     set->npend_ovfls,
+		     (void *)ip,
+		     (unsigned long long)set->used_pmds[0]);
+
+	/*
+	 * process 64-bit overflows; the returned count is not needed here
+	 */
+	pfm_intr_process_64bit_ovfls(ctx, set);
+
+	return;
+
+stop_monitoring:
+	/*
+	 * Does not happen for a self-monitored context.
+	 * We cannot attach to a kernel-only thread, thus it is safe to
+	 * set TIF bits, i.e., the thread will eventually leave the kernel
+	 * or die and either we will catch the context and clean it up in
+	 * pfm_handle_work() or pfm_exit_thread().
+	 *
+	 * Mask until we get to pfm_handle_work()
+	 * pfm_mask_monitoring(ctx, set);
+	 */
+	PFM_DBG_ovfl("ctx is zombie, converted to spurious");
+	pfm_post_work(current, ctx, PFM_WORK_ZOMBIE);
+}
+
+/**
+ * __pfm_interrupt_handler - 1st level interrupt handler
+ * @ip: interrupted instruction pointer
+ * @regs: machine state
+ *
+ * Function is static because we use a wrapper to easily capture timing info.
+ *
+ * Context locking is necessary to avoid concurrent accesses from other CPUs
+ */
+static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
+{
+	struct task_struct *task;
+	struct pfm_context *ctx;
+	struct pfm_event_set *set;
+
+
+	task = __get_cpu_var(pmu_owner);
+	ctx = __get_cpu_var(pmu_ctx);
+
+	/*
+	 * verify if there is a context on this CPU
+	 */
+	if (unlikely(ctx == NULL)) {
+		PFM_DBG_ovfl("no ctx");
+		goto spurious;
+	}
+
+	/*
+	 * we need to lock the context because it could be accessed
+	 * from another CPU. Depending on the priority level of
+	 * the PMU interrupt or the arch, it may be necessary to
+	 * mask interrupts altogether to avoid race conditions with
+	 * the timer interrupt in case of time-based set switching,
+	 * for instance.
+	 */
+	spin_lock(&ctx->lock);
+
+	set = ctx->active_set;
+
+	/*
+	 * For SMP per-thread, it is not possible to have
+	 * owner != NULL && task != current.
+	 *
+	 * For UP per-thread, because of lazy save, it
+	 * is possible to receive an interrupt in another task
+	 * which is not using the PMU. This means
+	 * that the interrupt was in-flight at the
+	 * time of pfm_ctxswout_thread(). In that
+	 * case, it will be replayed when the task
+	 * is scheduled again. Hence we convert to spurious.
+	 *
+	 * The basic rule is that an overflow is always
+	 * processed in the context of the task that
+	 * generated it for all per-thread contexts.
+ */ +#ifndef CONFIG_SMP + if (unlikely((task && current->pfm_context != ctx))) { + PFM_DBG_ovfl("spurious: not owned by current task"); + goto spurious; + } +#endif + /* + * check that monitoring is active, otherwise convert + * to spurious + */ + if (unlikely(!pfm_arch_is_active(ctx))) { + PFM_DBG_ovfl("spurious: monitoring non active"); + goto spurious; + } + + /* + * freeze PMU and collect overflowed PMD registers + * into set->povfl_pmds. Number of overflowed PMDs + * reported in set->npend_ovfls + */ + pfm_arch_intr_freeze_pmu(ctx, set); + + /* + * no overflow detected, interrupt may have come + * from the previous thread running on this CPU + */ + if (unlikely(!set->npend_ovfls)) { + PFM_DBG_ovfl("no npend_ovfls"); + goto spurious; + } + + /* + * invoke actual handler + */ + pfm_overflow_handler(ctx, set, ip, regs); + + /* + * unfreeze PMU + */ + pfm_arch_intr_unfreeze_pmu(ctx); + + spin_unlock(&ctx->lock); + + return; + +spurious: + /* ctx may be NULL */ + pfm_arch_intr_unfreeze_pmu(ctx); + if (ctx) + spin_unlock(&ctx->lock); +} + + +/** + * pfm_interrupt_handler - 1st level interrupt handler + * @ip: interrupt instruction pointer + * @regs: machine state + * + * Function called from the low-level assembly code or arch-specific perfmon + * code. Simple wrapper used for timing purpose. Actual work done in + * __pfm_overflow_handler() + */ +void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs) +{ + BUG_ON(!irqs_disabled()); + __pfm_interrupt_handler(ip, regs); +} diff --git a/perfmon/perfmon_pmu.c b/perfmon/perfmon_pmu.c new file mode 100644 index 000000000000..0e44ee8530a6 --- /dev/null +++ b/perfmon/perfmon_pmu.c @@ -0,0 +1,269 @@ +/* + * perfmon_pmu.c: perfmon2 PMU configuration management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+#ifndef CONFIG_MODULE_UNLOAD
+#define module_refcount(n)	1
+#endif
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock);
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock);
+static u32 pfm_pmu_acquired;
+
+/*
+ * the perfmon core must access PMU information ONLY through pfm_pmu_conf
+ * if pfm_pmu_conf is NULL, then no description is registered
+ */
+struct pfm_pmu_config *pfm_pmu_conf;
+EXPORT_SYMBOL(pfm_pmu_conf);
+
+/**
+ * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table
+ * @regs: the regdesc structure to initialize
+ * @excl_type: the register type(s) to exclude from this regdesc
+ * @unavail_pmcs: unavailable PMC registers
+ * @unavail_pmds: unavailable PMD registers
+ */
+static void pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type,
+				 u64 *unavail_pmcs, u64 *unavail_pmds)
+{
+	struct pfm_regmap_desc *d;
+	u16 n, n2, n_counters, i;
+	int max1, max2, max3;
+
+	/*
+	 * compute the number of implemented PMCs from the
+	 * description table
+	 */
+	n = 0;
+	max1 = max2 = -1;
+	d = pfm_pmu_conf->pmc_desc;
+	for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
+		if (!(d->type & PFM_REG_I))
+			continue;
+
+		if (pfm_arch_bv_test_bit(i, unavail_pmcs))
+			continue;
+
+		if (d->type & excl_type)
+			continue;
+
+		pfm_arch_bv_set_bit(i, regs->pmcs);
+
+		max1 = i;
+		n++;
+	}
+
+	regs->max_pmc = max1 + 1;
+	regs->num_pmcs = n;
+
+	n = n_counters = n2 = 0;
+	max1 = max2 = max3 = -1;
+	d = pfm_pmu_conf->pmd_desc;
+	for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
+		if (!(d->type & PFM_REG_I))
+			continue;
+
+		if (pfm_arch_bv_test_bit(i, unavail_pmds))
+			continue;
+
+		if (d->type & excl_type)
+			continue;
+
+		pfm_arch_bv_set_bit(i, regs->pmds);
+		max1 = i;
+		n++;
+
+		/*
+		 * read-write registers
+		 */
+		if (!(d->type & PFM_REG_RO)) {
+			pfm_arch_bv_set_bit(i, regs->rw_pmds);
+			max3 = i;
+			n2++;
+		}
+
+		/*
+		 * counter registers
+		 */
+		if (d->type & PFM_REG_C64) {
+			pfm_arch_bv_set_bit(i, regs->cnt_pmds);
+			n_counters++;
+		}
+
+		/*
+		 * PMDs with intr capabilities
+		 */
+		if (d->type & PFM_REG_INTR) {
+			pfm_arch_bv_set_bit(i, regs->intr_pmds);
+			max2 = i;
+		}
+	}
+
+	regs->max_pmd = max1 + 1;
+	regs->max_intr_pmd = max2 + 1;
+
+	regs->num_counters = n_counters;
+	regs->num_pmds = n;
+	regs->max_rw_pmd = max3 + 1;
+	regs->num_rw_pmd = n2;
+}
+
+int pfm_pmu_register(struct pfm_pmu_config *cfg)
+{
+	int ret = -EBUSY;
+
+	if (perfmon_disabled) {
+		PFM_INFO("perfmon disabled, cannot add PMU description");
+		return -ENOSYS;
+	}
+
+	spin_lock(&pfm_pmu_conf_lock);
+
+	if (pfm_pmu_conf)
+		goto unlock;
+
+	pfm_pmu_conf = cfg;
+	pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1;
+
+	ret = pfm_sysfs_add_pmu(pfm_pmu_conf);
+	if (ret)
+		pfm_pmu_conf = NULL;
+
+unlock:
+	spin_unlock(&pfm_pmu_conf_lock);
+
+	if (ret)
+		PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret);
+	else
+		PFM_INFO("%s PMU installed", cfg->pmu_name);
+	return ret;
+}
+
+/*
+ * acquire PMU resource from lower-level PMU register allocator
+ * (currently perfctr-watchdog.c)
+ *
+ * acquisition is done when the first context is created (and not
+ * when it is loaded).
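+ *
+ * Example (hypothetical x86-like PMU): with 4 implemented PMC/PMD
+ * pairs and the NMI watchdog owning PMC0/PMD0,
+ * pfm_arch_pmu_acquire() reports bit 0 in unavail_pmcs/unavail_pmds
+ * and pfm_pmu_regdesc_init() computes:
+ *
+ *   regs_all.pmcs = 0b1110, num_pmcs = 3, max_pmc = 4
+ *   regs_all.pmds = 0b1110, num_pmds = 3, max_pmd = 4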
+ * We grab all that is defined in the description
+ * module and then we make adjustments at the arch-specific level.
+ *
+ * The PMU resource is released when the last perfmon context is
+ * destroyed.
+ *
+ * interrupts are not masked
+ */
+int pfm_pmu_acquire(struct pfm_context *ctx)
+{
+	u64 unavail_pmcs[PFM_PMC_BV];
+	u64 unavail_pmds[PFM_PMD_BV];
+	int ret = 0;
+
+	spin_lock(&pfm_pmu_acq_lock);
+
+	PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+	pfm_pmu_acquired++;
+
+	if (pfm_pmu_acquired == 1) {
+
+		memset(unavail_pmcs, 0, sizeof(unavail_pmcs));
+		memset(unavail_pmds, 0, sizeof(unavail_pmds));
+
+		ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds);
+		if (ret) {
+			pfm_pmu_acquired--;
+		} else {
+			memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc));
+
+			pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all, 0,
+					     unavail_pmcs,
+					     unavail_pmds);
+
+			PFM_DBG("regs_all.pmcs=0x%llx",
+				(unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]);
+
+			/* available PMU resources */
+			PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters",
+				pfm_pmu_conf->regs_all.num_pmcs,
+				pfm_pmu_conf->regs_all.num_pmds,
+				pfm_pmu_conf->regs_all.num_counters);
+		}
+	}
+	spin_unlock(&pfm_pmu_acq_lock);
+	/*
+	 * copy global regdesc to context (for future extensions)
+	 */
+	ctx->regs = pfm_pmu_conf->regs_all;
+
+	return ret;
+}
+
+/*
+ * release the PMU resource
+ *
+ * actual release happens when last context is destroyed
+ *
+ * interrupts are not masked
+ */
+void pfm_pmu_release(void)
+{
+	BUG_ON(irqs_disabled());
+
+	/*
+	 * we need to use a spinlock because release takes some time
+	 * and we may have a race with pfm_pmu_acquire()
+	 */
+	spin_lock(&pfm_pmu_acq_lock);
+
+	PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
+
+	/*
+	 * we decouple test and decrement because if we had errors
+	 * in pfm_pmu_acquire(), we still come here on pfm_context_free()
+	 * but with pfm_pmu_acquired == 0
+	 */
+	if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) {
+		pfm_arch_pmu_release();
+		PFM_DBG("PMU released");
+	}
+	spin_unlock(&pfm_pmu_acq_lock);
+}
diff --git a/perfmon/perfmon_priv.h b/perfmon/perfmon_priv.h
new file mode 100644
index 000000000000..f1068e5ff308
--- /dev/null
+++ b/perfmon/perfmon_priv.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+
+#ifndef __PERFMON_PRIV_H__
+#define __PERFMON_PRIV_H__
+/*
+ * This file contains all the definitions of data structures, variables, macros
+ * that are private to the generic code, i.e., not shared with any code that
+ * lives under arch/ or include/asm-XX
+ *
+ * For shared definitions, use include/linux/perfmon_kern.h
+ */
+
+#ifdef CONFIG_PERFMON
+
+/*
+ * context lazy save/restore activation count
+ */
+#define PFM_INVALID_ACTIVATION	((u64)~0)
+
+DECLARE_PER_CPU(u64, pmu_activation_number);
+
+static inline void pfm_set_pmu_owner(struct task_struct *task,
+				     struct pfm_context *ctx)
+{
+	__get_cpu_var(pmu_owner) = task;
+	__get_cpu_var(pmu_ctx) = ctx;
+}
+
+int pfm_init_ctx(void);
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req,
+		     int count);
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req,
+		     int count);
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count);
+
+int pfm_session_acquire(void);
+void pfm_session_release(void);
+
+int pfm_init_sysfs(void);
+
+int __pfm_create_context(__u32 ctx_flags, struct pfarg_sinfo *sif,
+			 struct pfm_context **new_ctx);
+void pfm_free_context(struct pfm_context *ctx);
+void pfm_undo_create(int fd, struct pfm_context *ctx);
+
+int __pfm_stop(struct pfm_context *ctx);
+int __pfm_start(struct pfm_context *ctx);
+
+int __pfm_load_context(struct pfm_context *ctx, struct task_struct *task);
+int __pfm_unload_context(struct pfm_context *ctx);
+
+int pfm_alloc_fd(struct file **cfile);
+
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what);
+
+int pfm_pmu_acquire(struct pfm_context *ctx);
+void pfm_pmu_release(void);
+
+void pfm_save_pmds(struct pfm_context *ctx);
+
+/*
+ * check_mask bitmask values for pfm_check_task_state()
+ */
+#define PFM_CMD_STOPPED		0x01	/* command needs thread stopped */
+#define PFM_CMD_UNLOADED	0x02	/* command needs ctx unloaded */
+#define PFM_CMD_UNLOAD		0x04	/* command is unload */
+
+/**
+ * pfm_check_save_prev_ctx - check if previous context exists and save state
+ *
+ * called from pfm_load_ctx_thread() and __pfm_ctxswin_thread() to
+ * check if a previous context exists. If so, save its PMU state. This is
+ * used only for UP kernels.
+ *
+ * PMU ownership is not cleared because the function is always called while
+ * trying to install a new owner.
+ */
+static inline void pfm_check_save_prev_ctx(void)
+{
+#ifndef CONFIG_SMP
+	struct pfm_context *ctxp;
+
+	ctxp = __get_cpu_var(pmu_ctx);
+	if (!ctxp)
+		return;
+	/*
+	 * in UP per-thread, due to lazy save
+	 * there could be a context from another task.
We need to push it first before + * installing our new state + */ + pfm_save_pmds(ctxp); + /* + * do not clear ownership because we rewrite + * right away + */ +#endif +} + +int pfm_init_fs(void); + +static inline void pfm_post_work(struct task_struct *task, + struct pfm_context *ctx, int type) +{ + ctx->flags.work_type = type; + set_tsk_thread_flag(task, TIF_PERFMON_WORK); +} + +#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG +#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG + +#endif /* CONFIG_PERFMON */ + +#endif /* __PERFMON_PRIV_H__ */ diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c new file mode 100644 index 000000000000..0af9dfa98b22 --- /dev/null +++ b/perfmon/perfmon_res.c @@ -0,0 +1,223 @@ +/* + * perfmon_res.c: perfmon2 resource allocations + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c
new file mode 100644
index 000000000000..0af9dfa98b22
--- /dev/null
+++ b/perfmon/perfmon_res.c
@@ -0,0 +1,223 @@
+/*
+ * perfmon_res.c: perfmon2 resource allocations
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/*
+ * global information about all sessions
+ */
+struct pfm_resources {
+	cpumask_t sys_cpumask;	/* bitmask of used cpus */
+	u32 thread_sessions;	/* number of loaded per-thread sessions */
+};
+
+static struct pfm_resources pfm_res;
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock);
+
+/**
+ * pfm_session_acquire - reserve a per-thread session
+ *
+ * return:
+ * 	0     : success
+ * 	-EBUSY: if a conflicting session exists
+ */
+int pfm_session_acquire(void)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * validity checks on cpu_mask have been done upstream
+	 */
+	spin_lock_irqsave(&pfm_res_lock, flags);
+
+	PFM_DBG("in thread=%u",
+		pfm_res.thread_sessions);
+
+	pfm_res.thread_sessions++;
+
+	PFM_DBG("out thread=%u ret=%d",
+		pfm_res.thread_sessions,
+		ret);
+
+	spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+	return ret;
+}
+
+/**
+ * pfm_session_release - release a per-thread session
+ *
+ * called from __pfm_unload_context()
+ */
+void pfm_session_release(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pfm_res_lock, flags);
+
+	PFM_DBG("in thread=%u",
+		pfm_res.thread_sessions);
+
+	pfm_res.thread_sessions--;
+
+	PFM_DBG("out thread=%u",
+		pfm_res.thread_sessions);
+
+	spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+
+/**
+ * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus
+ *
+ * currently used by Oprofile on X86
+ */
+int pfm_session_allcpus_acquire(void)
+{
+	unsigned long flags;
+	u32 nsys_cpus, cpu;
+	int ret = -EBUSY;
+
+	spin_lock_irqsave(&pfm_res_lock, flags);
+
+	nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+	PFM_DBG("in sys=%u task=%u",
+		nsys_cpus,
+		pfm_res.thread_sessions);
+
+	if (nsys_cpus) {
+		PFM_DBG("already some system-wide sessions");
+		goto abort;
+	}
+
+	/*
+	 * cannot mix system-wide and per-task sessions
+	 */
+	if (pfm_res.thread_sessions) {
+		PFM_DBG("%u conflicting thread_sessions",
+			pfm_res.thread_sessions);
+		goto abort;
+	}
+
+	for_each_online_cpu(cpu) {
+		cpu_set(cpu, pfm_res.sys_cpumask);
+		nsys_cpus++;
+	}
+
+	PFM_DBG("out sys=%u task=%u",
+		nsys_cpus,
+		pfm_res.thread_sessions);
+
+	ret = 0;
+abort:
+	spin_unlock_irqrestore(&pfm_res_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(pfm_session_allcpus_acquire);
+
+/**
+ * pfm_session_allcpus_release - release per-cpu sessions on all cpus
+ *
+ * currently used by Oprofile code
+ */
+void pfm_session_allcpus_release(void)
+{
+	unsigned long flags;
+	u32 nsys_cpus, cpu;
+
+	spin_lock_irqsave(&pfm_res_lock, flags);
+
+	nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
+
+	PFM_DBG("in sys=%u task=%u",
+		nsys_cpus,
+		pfm_res.thread_sessions);
+
+	/*
+	 * XXX: could use __cpus_clear() with nbits
+	 */
+	for_each_online_cpu(cpu) {
+		cpu_clear(cpu, pfm_res.sys_cpumask);
+		nsys_cpus--;
+	}
+
+	PFM_DBG("out sys=%u task=%u",
+		nsys_cpus,
+		pfm_res.thread_sessions);
+
+	spin_unlock_irqrestore(&pfm_res_lock, flags);
+}
+EXPORT_SYMBOL(pfm_session_allcpus_release);
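pfm_session_allcpus_acquire() and pfm_session_allcpus_release() are meant to bracket a system-wide profiling run, as the Oprofile references above suggest. A hypothetical client could look like this; only the two exported calls come from this file:

	/*
	 * Illustrative client of the exported session API. The function
	 * names below are invented; only pfm_session_allcpus_acquire()
	 * and pfm_session_allcpus_release() come from this patch.
	 */
	static int example_profiler_start(void)
	{
		int ret;

		/* fails with -EBUSY if any conflicting session exists */
		ret = pfm_session_allcpus_acquire();
		if (ret)
			return ret;

		/* ... program and start the counters on each cpu ... */
		return 0;
	}

	static void example_profiler_stop(void)
	{
		/* ... stop the counters ... */
		pfm_session_allcpus_release();
	}
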
+
+/**
+ * pfm_sysfs_res_show - return current resource usage for sysfs
+ * @buf: buffer to hold string in return
+ * @sz: size of buf
+ * @what: what to produce
+ * 	what=0 : thread_sessions
+ * 	what=1 : cpus_weight(sys_cpumask)
+ * 	what=2 : smpl_buf_mem_cur
+ * 	what=3 : pmu model name
+ *
+ * called from perfmon_sysfs.c
+ * return number of bytes written into buf (up to sz)
+ */
+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pfm_res_lock, flags);
+
+	switch (what) {
+	case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions);
+		break;
+	case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask));
+		break;
+	case 3:
+		snprintf(buf, sz, "%s\n",
+			 pfm_pmu_conf ? pfm_pmu_conf->pmu_name
+				      : "unknown");
+	}
+	spin_unlock_irqrestore(&pfm_res_lock, flags);
+	return strlen(buf);
+}
diff --git a/perfmon/perfmon_rw.c b/perfmon/perfmon_rw.c
new file mode 100644
index 000000000000..bea77d455794
--- /dev/null
+++ b/perfmon/perfmon_rw.c
@@ -0,0 +1,449 @@
+/*
+ * perfmon_rw.c: perfmon2 PMC/PMD read/write system calls
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+/**
+ * is_invalid -- check if register index is within limits
+ * @cnum: register index
+ * @impl: bitmask of implemented registers
+ * @max: highest implemented register + 1
+ *
+ * return:
+ * 	0 if register index is valid
+ * 	1 if invalid
+ */
+static inline int is_invalid(u16 cnum, u64 *impl, u16 max)
+{
+	return cnum >= max || !pfm_arch_bv_test_bit(cnum, impl);
+}
+
+/**
+ * update_used_reg -- update used_pmcs for a single PMD
+ * @ctx: context to use
+ * @set: set to update
+ * @cnum: new PMD to add
+ *
+ * This function marks as used the config registers (PMCs) on which
+ * PMD cnum depends.
+ */
+static inline void update_used_reg(struct pfm_context *ctx,
+				   struct pfm_event_set *set, u16 cnum)
+{
+	pfm_arch_bv_or(set->used_pmcs,
+		       set->used_pmcs,
+		       pfm_pmu_conf->pmd_desc[cnum].dep_pmcs,
+		       ctx->regs.max_pmc);
+}
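update_changes() below iterates the used_pmcs bitmask with a population-count plus find-next-bit loop. The same idiom on a plain 64-bit word, with a hand-rolled find_next_bit64() standing in for pfm_arch_bv_find_next_bit():

	/*
	 * Illustrative only: the weight + find-next-bit walk performed by
	 * update_changes() with the pfm_arch_bv_*() helpers, shown here
	 * on a single 64-bit word.
	 */
	#include <stdio.h>

	static int find_next_bit64(unsigned long long mask, int pos)
	{
		for (; pos < 64; pos++)
			if (mask & (1ULL << pos))
				return pos;
		return 64;
	}

	int main(void)
	{
		unsigned long long used = 0x29;	/* bits 0, 3, 5 set */
		int n = __builtin_popcountll(used);
		int p, q;

		for (p = 0; n; n--, p = q + 1) {
			q = find_next_bit64(used, p);
			printf("pmc%d\n", q);	/* visits 0, 3, 5 */
		}
		return 0;
	}
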
+
+/**
+ * update_changes -- update nused_pmcs, nused_pmds, write newly touched PMCs
+ * @ctx: context to use
+ * @set: event set to use
+ * @old_used_pmcs: former used_pmcs bitmask
+ *
+ * This function updates nused_pmcs and nused_pmds after the last modification
+ * to an event set. When new PMCs are used, they must be initialized such
+ * that we do not pick up stale values from another session.
+ */
+static inline int update_changes(struct pfm_context *ctx,
+				 struct pfm_event_set *set,
+				 u64 *old_used_pmcs)
+{
+	struct pfarg_pmr req;
+	u16 max_pmc, max_pmd;
+	int n, p, q, ret = 0;
+
+	max_pmd = ctx->regs.max_pmd;
+	max_pmc = ctx->regs.max_pmc;
+
+	/*
+	 * update used counts
+	 */
+	set->nused_pmds = pfm_arch_bv_weight(set->used_pmds, max_pmd);
+	set->nused_pmcs = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+
+	PFM_DBG("u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u",
+		(unsigned long long)set->used_pmds[0],
+		set->nused_pmds,
+		(unsigned long long)set->used_pmcs[0],
+		set->nused_pmcs);
+
+	memset(&req, 0, sizeof(req));
+
+	n = pfm_arch_bv_weight(set->used_pmcs, max_pmc);
+	for (p = 0; n; n--, p = q + 1) {
+		q = pfm_arch_bv_find_next_bit(set->used_pmcs, max_pmc, p);
+
+		if (pfm_arch_bv_test_bit(q, old_used_pmcs))
+			continue;
+
+		req.reg_num = q;
+		req.reg_value = set->pmcs[q];
+
+		ret = __pfm_write_pmcs(ctx, &req, 1);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+/**
+ * __pfm_write_pmds - modify data registers
+ * @ctx: context to operate on
+ * @req: struct pfarg_pmr request from user
+ * @count: number of elements in the vector
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+	struct pfm_event_set *set;
+	u64 old_used_pmcs[PFM_PMC_BV];
+	u64 value, ovfl_mask;
+	u64 *impl_pmds;
+	u16 cnum, pmd_type, max_pmd;
+	int i, can_access_pmu;
+	int ret;
+	pfm_pmd_check_t wr_func;
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	max_pmd = ctx->regs.max_pmd;
+	impl_pmds = ctx->regs.pmds;
+	wr_func = pfm_pmu_conf->pmd_write_check;
+
+	can_access_pmu = 0;
+
+	/*
+	 * we cannot access the actual PMD registers when monitoring is masked
+	 */
+	if (unlikely(ctx->state == PFM_CTX_LOADED))
+		can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+	ret = -EINVAL;
+	set = ctx->active_set;
+
+	pfm_arch_bv_copy(old_used_pmcs, set->used_pmcs,
+			 ctx->regs.max_pmc);
+
+	for (i = 0; i < count; i++, req++) {
+
+		cnum = req->reg_num;
+
+		/*
+		 * cannot write to a non-existing register;
+		 * writes to read-only registers are ignored
+		 */
+		if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+			PFM_DBG("pmd%u is not available", cnum);
+			goto error;
+		}
+
+		pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+		/*
+		 * execute write checker, if any
+		 */
+		if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) {
+			ret = (*wr_func)(ctx, set, req);
+			if (ret)
+				goto error;
+
+		}
+
+		value = req->reg_value;
+
+		/*
+		 * we reprogram the PMD, hence we clear any pending
+		 * overflow. This affects overflow-based set switching
+		 * on restart, but the new value has already been
+		 * established here.
+		 */
+		if (pfm_arch_bv_test_bit(cnum, set->povfl_pmds)) {
+			set->npend_ovfls--;
+			pfm_arch_bv_clear_bit(cnum, set->povfl_pmds);
+		}
+
+		/*
+		 * update value
+		 */
+		set->pmds[cnum] = value;
+
+		pfm_arch_bv_set_bit(cnum, set->used_pmds);
+		update_used_reg(ctx, set, cnum);
+
+		set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS;
+		if (can_access_pmu)
+			pfm_write_pmd(ctx, cnum, value);
+
+		/*
+		 * update number of used PMD registers
+		 */
+		set->nused_pmds = pfm_arch_bv_weight(set->used_pmds,
+						     max_pmd);
+
+		PFM_DBG("pmd%u=0x%llx a_pmu=%d "
+			"ctx_pmd=0x%llx "
+			"u_pmds=0x%llx nu_pmds=%u",
+			cnum,
+			(unsigned long long)value,
+			can_access_pmu,
+			(unsigned long long)set->pmds[cnum],
+			(unsigned long long)set->used_pmds[0],
+			set->nused_pmds);
+	}
+	ret = 0;
+error:
+	update_changes(ctx, set, old_used_pmcs);
+	/*
+	 * make changes visible
+	 */
+	if (can_access_pmu)
+		pfm_arch_serialize();
+
+	return ret;
+}
+
+/**
+ * __pfm_write_pmcs - modify config registers
+ * @ctx: context to operate on
+ * @req: struct pfarg_pmr request from user
+ * @count: number of elements in the vector
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+	struct pfm_event_set *set;
+	u64 value, dfl_val, rsvd_msk;
+	u64 *impl_pmcs;
+	int i, can_access_pmu;
+	int ret;
+	u16 cnum, pmc_type, max_pmc;
+	pfm_pmc_check_t wr_func;
+
+	wr_func = pfm_pmu_conf->pmc_write_check;
+	max_pmc = ctx->regs.max_pmc;
+	impl_pmcs = ctx->regs.pmcs;
+
+	can_access_pmu = 0;
+
+	/*
+	 * we cannot access the actual PMC registers when monitoring is masked
+	 */
+	if (unlikely(ctx->state == PFM_CTX_LOADED))
+		can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+
+	ret = -EINVAL;
+	set = ctx->active_set;
+
+	for (i = 0; i < count; i++, req++) {
+
+		cnum = req->reg_num;
+		value = req->reg_value;
+
+		/*
+		 * no access to unavailable PMC register
+		 */
+		if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) {
+			PFM_DBG("pmc%u is not available", cnum);
+			goto error;
+		}
+
+		pmc_type = pfm_pmu_conf->pmc_desc[cnum].type;
+		dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
+		rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk;
+
+		/*
+		 * set reserved bits to default values
+		 * (reserved bits must be 1 in rsvd_msk)
+		 */
+		value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk);
+
+		/*
+		 * execute write checker, if any
+		 */
+		if (likely(wr_func && (pmc_type & PFM_REG_WC))) {
+			req->reg_value = value;
+			ret = (*wr_func)(ctx, set, req);
+			if (ret)
+				goto error;
+			value = req->reg_value;
+		}
+
+		/*
+		 * Now we commit the changes
+		 */
+
+		/*
+		 * mark PMC register as used.
+		 *
+		 * We do not track the PMD registers associated with a PMC
+		 * here: they will likely need to be written to become
+		 * useful, at which point the write marks them used.
+		 *
+		 * The used_pmcs bitmask is only useful on architectures where
+		 * the PMC needs to be modified for particular bits, especially
+		 * on overflow or to stop/start.
+		 */
+		if (!pfm_arch_bv_test_bit(cnum, set->used_pmcs)) {
+			pfm_arch_bv_set_bit(cnum, set->used_pmcs);
+			set->nused_pmcs++;
+		}
+
+		set->pmcs[cnum] = value;
+
+		set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
+		if (can_access_pmu)
+			pfm_arch_write_pmc(ctx, cnum, value);
+
+		PFM_DBG("pmc%u=0x%llx a_pmu=%d "
+			"u_pmcs=0x%llx nu_pmcs=%u",
+			cnum,
+			(unsigned long long)value,
+			can_access_pmu,
+			(unsigned long long)set->used_pmcs[0],
+			set->nused_pmcs);
+	}
+	ret = 0;
+error:
+	/*
+	 * make sure the changes are visible
+	 */
+	if (can_access_pmu)
+		pfm_arch_serialize();
+
+	return ret;
+}
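The sanitizing expression value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk) used above can be verified with concrete numbers; the masks below are invented for illustration:

	/* Standalone check of the reserved-bit sanitizing expression. */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t rsvd_msk = 0xff00;	/* bits 8-15 are reserved */
		uint64_t dfl_val  = 0x2300;	/* default for reserved bits */
		uint64_t value    = 0xabcd;	/* user-supplied value */

		/* same expression as in __pfm_write_pmcs() */
		value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk);

		/* user bits survive, reserved bits forced to the default */
		assert(value == 0x23cd);
		return 0;
	}
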
+
+/**
+ * __pfm_read_pmds - read data registers
+ * @ctx: context to operate on
+ * @req: struct pfarg_pmr request from user
+ * @count: number of elements in the vector
+ *
+ * The function succeeds whether the context is attached or not.
+ * When attached to another thread, that thread must be stopped.
+ *
+ * The context is locked and interrupts are disabled.
+ */
+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmr *req, int count)
+{
+	u64 val = 0, ovfl_mask, hw_val;
+	u64 *impl_pmds;
+	struct pfm_event_set *set;
+	int i, ret, can_access_pmu = 0;
+	u16 cnum, pmd_type, max_pmd;
+
+	ovfl_mask = pfm_pmu_conf->ovfl_mask;
+	impl_pmds = ctx->regs.pmds;
+	max_pmd = ctx->regs.max_pmd;
+
+	if (likely(ctx->state == PFM_CTX_LOADED)) {
+		can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task;
+		if (can_access_pmu)
+			pfm_arch_serialize();
+	}
+
+	/*
+	 * on both UP and SMP, we can only read the PMD from the hardware
+	 * register when the task is the owner of the local PMU.
+	 */
+	ret = -EINVAL;
+	set = ctx->active_set;
+
+	for (i = 0; i < count; i++, req++) {
+
+		cnum = req->reg_num;
+
+		if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
+			PFM_DBG("pmd%u is not implemented/inaccessible", cnum);
+			goto error;
+		}
+
+		pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
+
+		/*
+		 * it is not possible to read a PMD which was not requested:
+		 * - explicitly written via pfm_write_pmds()
+		 * - provided as a reg_smpl_pmds[] to another PMD during
+		 *   pfm_write_pmds()
+		 *
+		 * This is motivated by security and for optimization purposes:
+		 * - on context switch restore, we can restore only what
+		 *   we use (except when regs are directly readable at user
+		 *   level, e.g., IA-64 self-monitoring, i386 RDPMC).
+		 * - do not need to maintain PMC -> PMD dependencies
+		 */
+		if (unlikely(!pfm_arch_bv_test_bit(cnum, set->used_pmds))) {
+			PFM_DBG("pmd%u cannot be read, because not used", cnum);
+			goto error;
+		}
+
+		val = set->pmds[cnum];
+
+		/*
+		 * If the task is not the current one, then we check if the
+		 * PMU state is still in the local live registers due to lazy
+		 * ctxsw. If true, then we read directly from the registers.
+		 */
+		if (can_access_pmu) {
+			hw_val = pfm_read_pmd(ctx, cnum);
+			if (pmd_type & PFM_REG_C64)
+				val = (val & ~ovfl_mask)
+				    | (hw_val & ovfl_mask);
+			else
+				val = hw_val;
+		}
+
+		PFM_DBG("pmd%u=0x%llx",
+			cnum,
+			(unsigned long long)val);
+
+		req->reg_value = val;
+	}
+	ret = 0;
+error:
+	return ret;
+}
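For PFM_REG_C64 counters, __pfm_read_pmds() merges the software copy (the upper, virtualized bits) with the live hardware register (the lower bits selected by ovfl_mask). A standalone check, assuming a 32-bit wide hardware counter for illustration:

	/* Standalone check of the 64-bit counter virtualization merge. */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* low 32 bits live in hardware, the rest in software */
		uint64_t ovfl_mask = 0xffffffffULL;
		uint64_t sw_val = 0x0000000500000000ULL; /* 5 wraps so far */
		uint64_t hw_val = 0x00000000deadbeefULL; /* current count */

		/* same merge as in __pfm_read_pmds() for PFM_REG_C64 */
		uint64_t val = (sw_val & ~ovfl_mask) | (hw_val & ovfl_mask);

		assert(val == 0x00000005deadbeefULL);
		return 0;
	}
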
diff --git a/perfmon/perfmon_syscalls.c b/perfmon/perfmon_syscalls.c
new file mode 100644
index 000000000000..5c900bb05ad9
--- /dev/null
+++ b/perfmon/perfmon_syscalls.c
@@ -0,0 +1,741 @@
+/*
+ * perfmon_syscalls.c: perfmon2 system call interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/ptrace.h>
+#include <linux/perfmon_kern.h>
+#include <linux/uaccess.h>
+#include "perfmon_priv.h"
+
+/*
+ * Context locking rules:
+ * ---------------------
+ * - any thread with access to the file descriptor of a context can
+ *   potentially issue perfmon calls
+ *
+ * - calls must be serialized to guarantee correctness
+ *
+ * - as soon as a context is attached to a thread or CPU, it may be
+ *   actively monitoring. On some architectures, such as IA-64, this
+ *   is true even though the pfm_start() call has not been made. This
+ *   comes from the fact that on some architectures, it is possible to
+ *   start/stop monitoring from userland.
+ *
+ * - If monitoring is active, then there can be PMU interrupts. Because
+ *   context accesses must be serialized, the perfmon system calls
+ *   must mask interrupts as soon as the context is attached.
+ *
+ * - perfmon system calls that operate with the context unloaded cannot
+ *   assume it is actually unloaded when they are called. They first need
+ *   to check, and for that they need interrupts masked. Then, if the
+ *   context is actually unloaded, they can unmask interrupts.
+ *
+ * - interrupt masking holds true for other internal perfmon functions as
+ *   well, except in the PMU interrupt handler, because those interrupts
+ *   cannot be nested.
+ *
+ * - we mask ALL interrupts instead of just the PMU interrupt because we
+ *   also need to protect against timer interrupts which could trigger
+ *   a set switch.
+ */
+
+struct pfm_syscall_cookie {
+	struct file *filp;
+	int fput_needed;
+};
+
+/*
+ * cannot attach if :
+ * 	- kernel task
+ * 	- task not owned by caller (checked by ptrace_may_access())
+ * 	- task is dead or zombie
+ * 	- cannot use blocking notification when self-monitoring
+ */
+static int pfm_task_incompatible(struct pfm_context *ctx,
+				 struct task_struct *task)
+{
+	/*
+	 * cannot attach to a kernel thread
+	 */
+	if (!task->mm) {
+		PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
+		return -EPERM;
+	}
+
+	/*
+	 * cannot attach to a zombie task
+	 */
+	if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) {
+		PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/**
+ * pfm_get_task -- check permission and acquire task to monitor
+ * @ctx: perfmon context
+ * @pid: identification of the task to check
+ * @task: upon return, a pointer to the task to monitor
+ *
+ * This function is used in per-thread mode only AND when not
+ * self-monitoring. It finds the task to monitor and checks
+ * that the caller has permission to attach. It also checks
+ * that the task is stopped via ptrace so that we can safely
+ * modify its state.
+ *
+ * task refcount is incremented when successful.
+ */
+static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
+			struct task_struct **task)
+{
+	struct task_struct *p;
+	int ret = 0, ret1 = 0;
+
+	/*
+	 * When attaching to another thread we must ensure
+	 * that the thread is actually stopped. Just like with
+	 * perfmon system calls, we enforce that the thread
+	 * be ptraced and STOPPED by using ptrace_check_attach().
+	 *
+	 * As a consequence, only the ptracing parent can actually
+	 * attach a context to a thread. Obviously, this constraint
+	 * does not exist for self-monitoring threads.
+	 *
+	 * We use ptrace_may_access() to check for permission.
+	 */
+	read_lock(&tasklist_lock);
+
+	p = find_task_by_vpid(pid);
+	if (p)
+		get_task_struct(p);
+
+	read_unlock(&tasklist_lock);
+
+	if (!p) {
+		PFM_DBG("task not found %d", pid);
+		return -ESRCH;
+	}
+
+	ret = -EPERM;
+
+	/*
+	 * returns 0 if cannot attach
+	 */
+	ret1 = ptrace_may_access(p, PTRACE_MODE_ATTACH);
+	if (ret1)
+		ret = ptrace_check_attach(p, 0);
+
+	PFM_DBG("may_attach=%d check_attach=%d", ret1, ret);
+
+	if (ret || !ret1)
+		goto error;
+
+	ret = pfm_task_incompatible(ctx, p);
+	if (ret)
+		goto error;
+
+	*task = p;
+
+	return 0;
+error:
+	if (!(ret1 || ret))
+		ret = -EPERM;
+
+	put_task_struct(p);
+
+	return ret;
+}
+
+/*
+ * context must be locked when calling this function
+ */
+int __pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+			   unsigned long *flags)
+{
+	struct task_struct *task;
+	unsigned long local_flags, new_flags;
+	int state, ret;
+
+recheck:
+	/*
+	 * task is NULL for system-wide context
+	 */
+	task = ctx->task;
+	state = ctx->state;
+	local_flags = *flags;
+
+	PFM_DBG("state=%d check_mask=0x%x task=[%d]",
+		state, check_mask, task ? task->pid : -1);
+	/*
+	 * if the context is detached, then we do not touch
+	 * hardware, therefore there is no restriction on when we can
+	 * access it.
+	 */
+	if (state == PFM_CTX_UNLOADED)
+		return 0;
+	/*
+	 * no command can operate on a zombie context.
+	 * A context becomes zombie when the file that identifies
+	 * it is closed while the context is still attached to the
+	 * thread it monitors.
+	 */
+	if (state == PFM_CTX_ZOMBIE)
+		return -EINVAL;
+
+	/*
+	 * at this point, state is PFM_CTX_LOADED
+	 */
+
+	/*
+	 * some commands require the context to be unloaded to operate
+	 */
+	if (check_mask & PFM_CMD_UNLOADED) {
+		PFM_DBG("state=%d, cmd needs context unloaded", state);
+		return -EBUSY;
+	}
+
+	/*
+	 * self-monitoring is always ok.
+	 */
+	if (task == current)
+		return 0;
+
+	/*
+	 * at this point, we are monitoring another thread
+	 */
+
+	/*
+	 * When we operate on another thread, we must wait for it to be
+	 * stopped and completely off any CPU as we need to access the
+	 * PMU state (or machine state).
+	 *
+	 * A thread can be put in the STOPPED state in various ways
+	 * including PTRACE_ATTACH, or when it receives a SIGSTOP signal.
+	 * We enforce that the thread must be ptraced, so it is stopped
+	 * AND it CANNOT wake up while we operate on it because this
+	 * would require an action from the ptracing parent which is the
+	 * thread that is calling this function.
+	 *
+	 * The dependency on ptrace imposes that only the ptracing
+	 * parent can issue commands on a thread. This is unfortunate
+	 * but we do not know of a better way of doing this.
+	 */
+	if (check_mask & PFM_CMD_STOPPED) {
+
+		spin_unlock_irqrestore(&ctx->lock, local_flags);
+
+		/*
+		 * check that the thread is ptraced AND STOPPED
+		 */
+		ret = ptrace_check_attach(task, 0);
+
+		spin_lock_irqsave(&ctx->lock, new_flags);
+
+		/*
+		 * flags may be different than when we released the lock
+		 */
+		*flags = new_flags;
+
+		if (ret)
+			return ret;
+		/*
+		 * we must recheck to verify if state has changed
+		 */
+		if (unlikely(ctx->state != state)) {
+			PFM_DBG("old_state=%d new_state=%d",
+				state,
+				ctx->state);
+			goto recheck;
+		}
+	}
+	return 0;
+}
+
+int pfm_check_task_state(struct pfm_context *ctx, int check_mask,
+			 unsigned long *flags)
+{
+	int ret;
+	ret = __pfm_check_task_state(ctx, check_mask, flags);
+	PFM_DBG("ret=%d", ret);
+	return ret;
+}
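The unlock/sleep/relock/recheck sequence in __pfm_check_task_state() is a general pattern: the lock must be dropped around ptrace_check_attach() because it may sleep, and the guarded state must be revalidated afterwards. Its skeleton, with placeholder types and names:

	/*
	 * Illustrative skeleton of the drop-lock-and-revalidate pattern
	 * used by __pfm_check_task_state(). All identifiers here are
	 * placeholders, not part of this patch.
	 */
	static int wait_until_stopped(struct obj *o, unsigned long *flags)
	{
		int state, ret;
	recheck:
		state = o->state;

		spin_unlock_irqrestore(&o->lock, *flags);
		ret = do_blocking_check(o);	/* may sleep */
		spin_lock_irqsave(&o->lock, *flags);

		if (ret)
			return ret;
		if (o->state != state)		/* state moved under us */
			goto recheck;
		return 0;
	}
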
+
+/**
+ * pfm_get_args - copy the syscall argument into kernel memory
+ * @ureq: user argument
+ * @sz: user argument size
+ * @lsz: size of stack buffer
+ * @laddr: stack buffer address
+ * @req: pointer to start of kernel copy of the argument
+ * @ptr_free: address of kernel copy to free
+ *
+ * There are two options:
+ * 	- use a stack buffer described by laddr (address) and lsz (size)
+ * 	- allocate memory
+ *
+ * return:
+ * 	< 0 : in case of error (ptr_free may not be updated)
+ * 	0   : success
+ *      - req: points to base of kernel copy of arguments
+ *	- ptr_free: address of buffer to free by caller on exit.
+ *		    NULL if using the stack buffer
+ *
+ * when ptr_free is not NULL upon return, the caller must kfree()
+ */
+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
+		 void **req, void **ptr_free)
+{
+	void *addr;
+
+	/*
+	 * check sysadmin argument limit
+	 */
+	if (unlikely(sz > pfm_controls.arg_mem_max)) {
+		PFM_DBG("argument too big %zu max=%zu",
+			sz,
+			pfm_controls.arg_mem_max);
+		return -E2BIG;
+	}
+
+	/*
+	 * check if vector fits in the stack buffer
+	 */
+	if (sz > lsz) {
+		addr = kmalloc(sz, GFP_KERNEL);
+		if (unlikely(addr == NULL))
+			return -ENOMEM;
+		*ptr_free = addr;
+	} else {
+		addr = laddr;
+		*req = laddr;
+		*ptr_free = NULL;
+	}
+
+	/*
+	 * bring the data in
+	 */
+	if (unlikely(copy_from_user(addr, ureq, sz))) {
+		if (addr != laddr)
+			kfree(addr);
+		return -EFAULT;
+	}
+
+	/*
+	 * base address of kernel buffer
+	 */
+	*req = addr;
+
+	return 0;
+}
+
+/**
+ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor
+ * @fd: file descriptor
+ * @ctx: pointer to pointer of context updated on return
+ * @cookie: opaque structure to use for release
+ *
+ * This helper function extracts the ctx from the file descriptor.
+ * It also increments the refcount of the file structure. It
+ * updates the cookie so the refcount can be decreased when
+ * leaving the perfmon syscall via pfm_release_ctx_from_fd().
+ */
+static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx,
+				   struct pfm_syscall_cookie *cookie)
+{
+	struct file *filp;
+	int fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(filp == NULL)) {
+		PFM_DBG("invalid fd %d", fd);
+		return -EBADF;
+	}
+
+	*ctx = filp->private_data;
+
+	if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) {
+		PFM_DBG("fd %d not related to perfmon", fd);
+		fput_light(filp, fput_needed);
+		return -EBADF;
+	}
+	cookie->filp = filp;
+	cookie->fput_needed = fput_needed;
+
+	return 0;
+}
+
+/**
+ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context
+ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd
+ */
+static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie)
+{
+	fput_light(cookie->filp, cookie->fput_needed);
+}
+
+/**
+ * pfm_validate_type_sz -- validate sz based on type
+ * @type : PFM_RW_XX type passed to pfm_write or pfm_read
+ * @sz   : vector size in bytes
+ *
+ * return:
+ * 	the number of elements in the vector, 0 if error
+ */
+static size_t pfm_validate_type_sz(int type, size_t sz)
+{
+	size_t count, sz_type;
+
+	switch (type) {
+	case PFM_RW_PMD:
+	case PFM_RW_PMC:
+		sz_type = sizeof(struct pfarg_pmr);
+		break;
+	default:
+		PFM_DBG("invalid type=%d", type);
+		return 0;
+	}
+
+	count = sz / sz_type;
+
+	if ((count * sz_type) != sz) {
+		PFM_DBG("invalid size=%zu for type=%d", sz, type);
+		return 0;
+	}
+
+	PFM_DBG("sz=%zu sz_type=%zu count=%zu",
+		sz,
+		sz_type,
+		count);
+
+	return count;
+}
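pfm_validate_type_sz() and pfm_get_args() together implement a common syscall idiom: reject byte counts that are not an exact multiple of the element size, then copy small vectors into an on-stack buffer and large ones into kmalloc'd memory. A condensed sketch; the element type and STK_ELEMS constant are invented here, where the patch uses struct pfarg_pmr and PFM_STK_ARG:

	/*
	 * Illustrative only: exact-multiple validation plus the
	 * stack-or-heap buffer choice made by the pfm syscalls.
	 */
	#define STK_ELEMS 8	/* invented; the patch uses PFM_STK_ARG */

	struct elem {		/* invented stand-in for struct pfarg_pmr */
		unsigned short reg_num;
		unsigned long long reg_value;
	};

	static int get_vector(const void __user *ureq, size_t sz,
			      struct elem *stk, void **to_free,
			      struct elem **out)
	{
		void *addr = stk;

		if (sz % sizeof(struct elem))	/* trailing bytes: reject */
			return -EINVAL;

		*to_free = NULL;
		if (sz > STK_ELEMS * sizeof(struct elem)) {
			addr = kmalloc(sz, GFP_KERNEL);
			if (!addr)
				return -ENOMEM;
			*to_free = addr;	/* caller must kfree() */
		}

		if (copy_from_user(addr, ureq, sz)) {
			kfree(*to_free);	/* kfree(NULL) is a no-op */
			return -EFAULT;
		}
		*out = addr;
		return 0;
	}
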
+
+/*
+ * unlike the other perfmon system calls, this one returns a file descriptor
+ * or a value < 0 in case of error, very much like open() or socket()
+ */
+asmlinkage long sys_pfm_create(int flags, struct pfarg_sinfo __user *ureq)
+{
+	struct pfm_context *new_ctx;
+	struct pfarg_sinfo sif;
+	int ret;
+
+	PFM_DBG("flags=0x%x sif=%p", flags, ureq);
+
+	if (perfmon_disabled)
+		return -ENOSYS;
+
+	if (flags) {
+		PFM_DBG("no flags accepted yet");
+		return -EINVAL;
+	}
+	ret = __pfm_create_context(flags, &sif, &new_ctx);
+
+	/*
+	 * copy sif to user level argument, if requested
+	 */
+	if (ret >= 0 && ureq && copy_to_user(ureq, &sif, sizeof(sif))) {
+		pfm_undo_create(ret, new_ctx);
+		ret = -EFAULT;
+	}
+	return ret;
+}
+
+asmlinkage long sys_pfm_write(int fd, int uflags,
+			      int type,
+			      void __user *ureq,
+			      size_t sz)
+{
+	u64 buf[PFM_STK_ARG];
+	struct pfm_context *ctx;
+	struct pfm_syscall_cookie cookie;
+	void *req, *fptr;
+	unsigned long flags;
+	size_t count;
+	int ret;
+
+	PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+		fd, uflags, type, ureq, sz);
+
+	if (uflags) {
+		PFM_DBG("no flags defined");
+		return -EINVAL;
+	}
+
+	count = pfm_validate_type_sz(type, sz);
+	if (!count)
+		return -EINVAL;
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+	if (ret)
+		goto error;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+	if (ret)
+		goto skip;
+	switch (type) {
+	case PFM_RW_PMC:
+		ret = __pfm_write_pmcs(ctx, req, count);
+		break;
+	case PFM_RW_PMD:
+		ret = __pfm_write_pmds(ctx, req, count);
+		break;
+	default:
+		PFM_DBG("invalid type=%d", type);
+		ret = -EINVAL;
+	}
+skip:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/*
+	 * This function may be on the critical path.
+	 * We want to avoid the branch if unnecessary.
+	 */
+	if (fptr)
+		kfree(fptr);
+error:
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_read(int fd, int uflags,
+			     int type,
+			     void __user *ureq,
+			     size_t sz)
+{
+	u64 buf[PFM_STK_ARG];
+	struct pfm_context *ctx;
+	struct pfm_syscall_cookie cookie;
+	void *req, *fptr;
+	unsigned long flags;
+	size_t count;
+	int ret;
+
+	PFM_DBG("fd=%d flags=0x%x type=%d req=%p sz=%zu",
+		fd, uflags, type, ureq, sz);
+
+	if (uflags) {
+		PFM_DBG("no flags defined");
+		return -EINVAL;
+	}
+
+	count = pfm_validate_type_sz(type, sz);
+	if (!count)
+		return -EINVAL;
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	ret = pfm_get_args(ureq, sz, sizeof(buf), buf, (void **)&req, &fptr);
+	if (ret)
+		goto error;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+	if (ret)
+		goto skip;
+
+	switch (type) {
+	case PFM_RW_PMD:
+		ret = __pfm_read_pmds(ctx, req, count);
+		break;
+	default:
+		PFM_DBG("invalid type=%d", type);
+		ret = -EINVAL;
+	}
+skip:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (copy_to_user(ureq, req, sz))
+		ret = -EFAULT;
+
+	if (fptr)
+		kfree(fptr);
+error:
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_set_state(int fd, int uflags, int state)
+{
+	struct pfm_context *ctx;
+	struct pfm_syscall_cookie cookie;
+	unsigned long flags;
+	int ret;
+
+	PFM_DBG("fd=%d uflags=0x%x state=0x%x", fd, uflags, state);
+
+	if (uflags) {
+		PFM_DBG("no flags defined");
+		return -EINVAL;
+	}
+
+	switch (state) {
+	case PFM_ST_START:
+	case PFM_ST_STOP:
+		break;
+	default:
+		PFM_DBG("invalid state=0x%x", state);
+		return -EINVAL;
+	}
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags);
+	if (!ret) {
+		if (state == PFM_ST_STOP)
+			ret = __pfm_stop(ctx);
+		else
+			ret = __pfm_start(ctx);
+	}
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	pfm_release_ctx_from_fd(&cookie);
+
+	return ret;
+}
+
+static long pfm_detach(int fd, int uflags)
+{
+	struct pfm_context *ctx;
+	struct pfm_syscall_cookie cookie;
+	unsigned long flags;
+	int ret;
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, &flags);
+	if (!ret)
+		ret = __pfm_unload_context(ctx);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/*
+	 * if unload was successful, release the session.
+	 * This must be done with interrupts enabled, thus we need
+	 * to defer until we are out of __pfm_unload_context()
+	 */
+	if (!ret)
+		pfm_session_release();
+
+	pfm_release_ctx_from_fd(&cookie);
+
+	return ret;
+}
+
+asmlinkage long sys_pfm_attach(int fd, int uflags, int target)
+{
+	struct pfm_context *ctx;
+	struct task_struct *task;
+	struct pfm_syscall_cookie cookie;
+	unsigned long flags;
+	int ret;
+
+	PFM_DBG("fd=%d uflags=0x%x target=%d", fd, uflags, target);
+
+	if (uflags) {
+		PFM_DBG("invalid flags");
+		return -EINVAL;
+	}
+
+	/*
+	 * handle detach in a separate function
+	 */
+	if (target == PFM_NO_TARGET)
+		return pfm_detach(fd, uflags);
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	task = current;
+
+	/*
+	 * in per-thread mode (not self-monitoring), get a reference
+	 * on the task to monitor. This must be done with interrupts
+	 * enabled. Upon successful return, the refcount on task has
+	 * increased.
+	 *
+	 * fget_light() is protecting the context.
+	 */
+	if (target != current->pid) {
+		ret = pfm_get_task(ctx, target, &task);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * irqsave is required to avoid a race in case the context is
+	 * already loaded, or with a switch timeout in the case of
+	 * self-monitoring
+	 */
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags);
+	if (!ret)
+		ret = __pfm_load_context(ctx, task);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	/*
+	 * in per-thread mode (not self-monitoring), we need
+	 * to decrease the refcount on the task to monitor:
+	 * 	- attach successful: we have a reference in ctx->task
+	 * 	- attach failed    : undo the effect of pfm_get_task()
+	 */
+	if (task != current)
+		put_task_struct(task);
+error:
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
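Taken together, the system calls above compose as create, program, attach, start, stop, read. A hypothetical self-monitoring sequence from userland; the __NR_pfm_* numbers are placeholders that must be defined to match the target kernel, and the pfarg_* types and PFM_* constants come from the perfmon user header, which is not shown here:

	/*
	 * Hypothetical self-monitoring sequence. Syscall numbers are
	 * placeholders; error handling is trimmed for brevity.
	 */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int monitor_self(void)
	{
		struct pfarg_sinfo sif;
		struct pfarg_pmr pmc = { .reg_num = 0, .reg_value = 0 };
		struct pfarg_pmr pmd = { .reg_num = 0, .reg_value = 0 };
		int fd;

		fd = syscall(__NR_pfm_create, 0, &sif);
		if (fd < 0)
			return -1;

		/* program one config and one data register */
		syscall(__NR_pfm_write, fd, 0, PFM_RW_PMC, &pmc, sizeof(pmc));
		syscall(__NR_pfm_write, fd, 0, PFM_RW_PMD, &pmd, sizeof(pmd));

		/* attach to ourselves, then start counting */
		syscall(__NR_pfm_attach, fd, 0, getpid());
		syscall(__NR_pfm_set_state, fd, 0, PFM_ST_START);

		/* ... workload to measure ... */

		syscall(__NR_pfm_set_state, fd, 0, PFM_ST_STOP);
		if (!syscall(__NR_pfm_read, fd, 0, PFM_RW_PMD,
			     &pmd, sizeof(pmd)))
			printf("count=%llu\n",
			       (unsigned long long)pmd.reg_value);
		return close(fd);
	}
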
diff --git a/perfmon/perfmon_sysfs.c b/perfmon/perfmon_sysfs.c
new file mode 100644
index 000000000000..b13c12581175
--- /dev/null
+++ b/perfmon/perfmon_sysfs.c
@@ -0,0 +1,344 @@
+/*
+ * perfmon_sysfs.c: perfmon2 sysfs interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
+ *                David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * More information about perfmon available at:
+ * http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/perfmon_kern.h>
+#include "perfmon_priv.h"
+
+struct pfm_attribute {
+	struct attribute attr;
+	ssize_t (*show)(void *, struct pfm_attribute *attr, char *);
+	ssize_t (*store)(void *, const char *, size_t);
+};
+#define to_attr(n) container_of(n, struct pfm_attribute, attr)
+
+#define PFM_RO_ATTR(_name, _show) \
+	struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define PFM_RW_ATTR(_name, _show, _store) \
+	struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store)
+
+#define PFM_ROS_ATTR(_name, _show) \
+	struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
+
+#define is_attr_name(a, n) (!strcmp((a)->attr.name, n))
+
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
+
+static struct kobject *pfm_kernel_kobj;
+static struct kobject *pfm_pmu_kobj;
+
+static ssize_t pfm_regs_attr_show(struct kobject *kobj,
+				  struct attribute *attr, char *buf)
+{
+#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj)
+	struct pfm_regmap_desc *reg = to_reg(kobj);
+	struct pfm_attribute *attribute = to_attr(attr);
+	return attribute->show ? attribute->show(reg, attribute, buf) : -EIO;
+}
+
+static struct sysfs_ops pfm_regs_sysfs_ops = {
+	.show = pfm_regs_attr_show
+};
+
+static struct kobj_type pfm_regs_ktype = {
+	.sysfs_ops = &pfm_regs_sysfs_ops,
+};
+
+static ssize_t pfm_controls_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	if (is_attr_name(attr, "version"))
+		return snprintf(buf, PAGE_SIZE, "%u.%u\n",
+				PFM_VERSION_MAJ, PFM_VERSION_MIN);
+
+	if (is_attr_name(attr, "task_sessions_count"))
+		return pfm_sysfs_res_show(buf, PAGE_SIZE, 0);
+
+	if (is_attr_name(attr, "debug"))
+		return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug);
+
+	if (is_attr_name(attr, "task_group"))
+		return snprintf(buf, PAGE_SIZE, "%d\n",
+				pfm_controls.task_group);
+
+	if (is_attr_name(attr, "arg_mem_max"))
+		return snprintf(buf, PAGE_SIZE, "%zu\n",
+				pfm_controls.arg_mem_max);
+
+	return 0;
+}
+
+static ssize_t pfm_controls_store(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  const char *buf, size_t count)
+{
+	size_t d;
+
+	if (sscanf(buf, "%zu", &d) != 1)
+		goto skip;
+
+	if (is_attr_name(attr, "debug"))
+		pfm_controls.debug = d;
+
+	if (is_attr_name(attr, "task_group"))
+		pfm_controls.task_group = d;
+
+	if (is_attr_name(attr, "arg_mem_max")) {
+		/*
+		 * we impose a page as the minimum.
+		 *
+		 * This limit may be smaller than the stack buffer
+		 * available and that is fine.
+		 */
+		if (d >= PAGE_SIZE)
+			pfm_controls.arg_mem_max = d;
+	}
+
+skip:
+	return count;
+}
+
+/*
+ * /sys/kernel/perfmon attributes
+ */
+static PFM_RO_ATTR(version, pfm_controls_show);
+static PFM_RO_ATTR(task_sessions_count, pfm_controls_show);
+static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store);
+static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store);
+
+static struct attribute *pfm_kernel_attrs[] = {
+	&attr_version.attr,
+	&attr_task_sessions_count.attr,
+	&attr_debug.attr,
+	&attr_task_group.attr,
+	&attr_arg_mem_max.attr,
+	NULL
+};
+
+static struct attribute_group pfm_kernel_attr_group = {
+	.attrs = pfm_kernel_attrs,
+};
+
+/*
+ * per-reg attributes
+ */
+static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf)
+{
+	struct pfm_regmap_desc *reg = data;
+	int w;
+
+	if (is_attr_name(attr, "name"))
+		return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc);
+
+	if (is_attr_name(attr, "dfl_val"))
+		return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+				(unsigned long long)reg->dfl_val);
+
+	if (is_attr_name(attr, "width")) {
+		w = (reg->type & PFM_REG_C64) ?
+		    pfm_pmu_conf->counter_width : 64;
+		return snprintf(buf, PAGE_SIZE, "%d\n", w);
+	}
+
+	if (is_attr_name(attr, "rsvd_msk"))
+		return snprintf(buf, PAGE_SIZE, "0x%llx\n",
+				(unsigned long long)reg->rsvd_msk);
+
+	if (is_attr_name(attr, "addr"))
+		return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr);
+
+	return 0;
+}
+
+static PFM_ROS_ATTR(name, pfm_reg_show);
+static PFM_ROS_ATTR(dfl_val, pfm_reg_show);
+static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show);
+static PFM_ROS_ATTR(width, pfm_reg_show);
+static PFM_ROS_ATTR(addr, pfm_reg_show);
+
+static struct attribute *pfm_reg_attrs[] = {
+	&attr_name.attr,
+	&attr_dfl_val.attr,
+	&attr_rsvd_msk.attr,
+	&attr_width.attr,
+	&attr_addr.attr,
+	NULL
+};
+
+static struct attribute_group pfm_reg_attr_group = {
+	.attrs = pfm_reg_attrs,
+};
+
+static ssize_t pfm_pmu_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	if (is_attr_name(attr, "model"))
+		return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name);
+	return 0;
+}
+
+static PFM_RO_ATTR(model, pfm_pmu_show);
+
+static struct attribute *pfm_pmu_desc_attrs[] = {
+	&attr_model.attr,
+	NULL
+};
+
+static struct attribute_group pfm_pmu_desc_attr_group = {
+	.attrs = pfm_pmu_desc_attrs,
+};
+
+static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu)
+{
+	struct pfm_regmap_desc *reg;
+	unsigned int i, k;
+	int ret;
+
+	reg = pmu->pmc_desc;
+	for (i = 0; i < pmu->num_pmc_entries; i++, reg++) {
+
+		if (!(reg->type & PFM_REG_I))
+			continue;
+
+		ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+					   pfm_pmu_kobj, "pmc%u", i);
+		if (ret)
+			goto undo_pmcs;
+
+		ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+		if (ret) {
+			kobject_del(&reg->kobj);
+			goto undo_pmcs;
+		}
+	}
+
+	reg = pmu->pmd_desc;
+	for (i = 0; i < pmu->num_pmd_entries; i++, reg++) {
+
+		if (!(reg->type & PFM_REG_I))
+			continue;
+
+		ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
+					   pfm_pmu_kobj, "pmd%u", i);
+		if (ret)
+			goto undo_pmds;
+
+		ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
+		if (ret) {
+			kobject_del(&reg->kobj);
+			goto undo_pmds;
+		}
+	}
+	return 0;
+undo_pmds:
+	reg = pmu->pmd_desc;
+	for (k = 0; k < i; k++, reg++) {
+		if (!(reg->type & PFM_REG_I))
+			continue;
+		sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+		kobject_del(&reg->kobj);
+	}
+	i = pmu->num_pmc_entries;
+	/* fall through */
+undo_pmcs:
+	reg = pmu->pmc_desc;
+	for (k = 0; k < i; k++, reg++) {
+		if (!(reg->type & PFM_REG_I))
+			continue;
+		sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
+		kobject_del(&reg->kobj);
+	}
+	return ret;
+}
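The attributes registered here surface as the files documented in the sysfs-perfmon ABI entries of this merge; reading them from userland is ordinary file I/O, for example:

	/* Read one perfmon sysfs attribute; path from the ABI document. */
	#include <stdio.h>

	int main(void)
	{
		char line[64];
		FILE *f = fopen("/sys/kernel/perfmon/version", "r");

		if (f && fgets(line, sizeof(line), f))
			printf("perfmon interface: %s", line);
		if (f)
			fclose(f);
		return 0;
	}
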
+
+/*
+ * when a PMU description module is inserted, we create
+ * a pmu_desc subdir in sysfs and populate it with
+ * PMU-specific information, such as register mappings
+ */
+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu)
+{
+	int ret;
+
+	pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj);
+	if (!pfm_pmu_kobj)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+	if (ret) {
+		/* will release pfm_pmu_kobj */
+		kobject_put(pfm_pmu_kobj);
+		return ret;
+	}
+
+	ret = pfm_sysfs_add_pmu_regs(pmu);
+	if (ret) {
+		sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+		/* will release pfm_pmu_kobj */
+		kobject_put(pfm_pmu_kobj);
+	} else
+		kobject_uevent(pfm_pmu_kobj, KOBJ_ADD);
+
+	return ret;
+}
+
+int __init pfm_init_sysfs(void)
+{
+	int ret;
+
+	/*
+	 * dynamic allocation happens on pfm_kernel_kobj,
+	 * but a release callback is attached
+	 */
+	pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj);
+	if (!pfm_kernel_kobj) {
+		PFM_ERR("cannot add kernel object");
+		return -ENOMEM;
+	}
+
+	ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group);
+	if (ret) {
+		kobject_put(pfm_kernel_kobj);
+		return ret;
+	}
+
+	if (pfm_pmu_conf)
+		pfm_sysfs_add_pmu(pfm_pmu_conf);
+
+	return 0;
+}