From 948f984df52511bb0efa5c026813b0c34de43aa0 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 20 May 2008 14:39:25 +0300
Subject: core, x86: make LIST_POISON less deadly

The list macros use LIST_POISON1 and LIST_POISON2 as undereferencable
pointers in order to trap erronous use of freed list_heads.  Unfortunately
userspace can arrange for those pointers to actually be dereferencable,
potentially turning an oops to an expolit.

To avoid this allow architectures (currently x86_64 only) to override
the default values for these pointers with truly-undereferencable values.
This is easy on x86_64 as the virtual address space is large and contains
permanently unmapped ranges.

Other 64-bit architectures will likely find similar unmapped ranges.

Signed-off-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig       |  5 +++++
 include/linux/poison.h | 10 ++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..e21cc33b794a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1021,6 +1021,11 @@ config ARCH_MEMORY_PROBE
 	def_bool X86_64
 	depends on MEMORY_HOTPLUG
 
+config ILLEGAL_POINTER_VALUE
+       hex
+       default 0 if X86_32
+       default 0xffffc10000000000 if X86_64
+
 source "mm/Kconfig"
 
 config HIGHPTE
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 9f31683728fd..0d105a56e668 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -1,14 +1,20 @@
 #ifndef _LINUX_POISON_H
 #define _LINUX_POISON_H
 
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+#define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+#else
+#define POISON_POINTER_DELTA 0L
+#endif
+
 /********** include/linux/list.h **********/
 /*
  * These are non-NULL pointers that will result in page faults
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
+#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
-- 
cgit v1.2.3


From d23db9bbd91baaa1b22e7839e1e932bc6ae0582b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Jul 2008 18:03:27 +0200
Subject: x86: change LIST_POISON to 0xdead000000000000

Linus observed:

| > +       default 0xffffc10000000000 if X86_64
|
| This looks like a singularly bad pointer value on x86-64.
|
| Why not pick something that is *guaranteed* to fault? The above looks like
| any future setup that supports 41 bits of addressing and has extended the
| page tables (yes, it will happen eventually) will find that to be a
| perfectly valid address?
|
| It's also visually confusing, since it's visually very close to a real
| kernel pointer too.

Correct - so change the poison value to 0xdead000000000000.

Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f09b3e5d808d..0dbd040bd654 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1022,7 +1022,7 @@ config ARCH_MEMORY_PROBE
 config ILLEGAL_POINTER_VALUE
        hex
        default 0 if X86_32
-       default 0xffffc10000000000 if X86_64
+       default 0xdead000000000000 if X86_64
 
 source "mm/Kconfig"
 
-- 
cgit v1.2.3


From d31ec1c47b10e268794e7b739b2009ac6a19eb8b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 22 Jul 2008 09:40:43 +0200
Subject: poison-pointers: clean up, add comments

- remove stray 'L' (noticed by Andrew Morton)
- add more comments about the purpose of this change
- fix spelling (noticed by Andrew Morton)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/poison.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 0d105a56e668..603d3bda31d9 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -1,13 +1,19 @@
 #ifndef _LINUX_POISON_H
 #define _LINUX_POISON_H
 
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
 #ifdef CONFIG_ILLEGAL_POINTER_VALUE
-#define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
 #else
-#define POISON_POINTER_DELTA 0L
+# define POISON_POINTER_DELTA 0
 #endif
 
-/********** include/linux/list.h **********/
 /*
  * These are non-NULL pointers that will result in page faults
  * under normal circumstances, used to verify that nobody uses
-- 
cgit v1.2.3


From 36b21230589ee51763afcc9ba2d126973f3ae96c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Sat, 27 Dec 2008 15:21:36 +0530
Subject: x86: eliminate atleast 10684 sparse warnings

Impact: cleanup, avoid 10000+ sparse warnings

Eliminate 1434 sparse warnings like:
include/linux/list.h:106:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
include/linux/list.h:107:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
include/linux/list.h:579:12: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1434 sparse warnings like:
include/linux/list.h:580:13: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
include/linux/rculist.h:97:16: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
include/linux/rculist.h:143:14: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
include/linux/rculist.h:260:13: warning: constant 0xdead000000000000 is so big it is unsigned long

Eliminate 1237 sparse warnings like:
include/linux/rculist.h:280:15: warning: constant 0xdead000000000000 is so big it is unsigned long

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/poison.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 603d3bda31d9..f2de9a999f4a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -9,7 +9,7 @@
  * that is also not mappable by user-space exploits:
  */
 #ifdef CONFIG_ILLEGAL_POINTER_VALUE
-# define POISON_POINTER_DELTA CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
 #else
 # define POISON_POINTER_DELTA 0
 #endif
-- 
cgit v1.2.3


From 361c99a661a78ed22264649440e87fe4fe8da1f2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 20:56:53 -0200
Subject: perf evsel: Introduce perf_evlist

Killing two more perf wide global variables: nr_counters and evsel_list
as a list_head.

There are more operations that will need more fields in perf_evlist,
like the pollfd for polling all the fds in a list of evsel instances.

Use option->value to pass the evsel_list to parse_{events,filters}.

LKML-Reference: <new-submission>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile                  |  2 ++
 tools/perf/builtin-record.c          | 47 ++++++++++++++++------------
 tools/perf/builtin-stat.c            | 34 +++++++++++---------
 tools/perf/builtin-top.c             | 60 ++++++++++++++++++++----------------
 tools/perf/util/evlist.c             | 53 +++++++++++++++++++++++++++++++
 tools/perf/util/evlist.h             | 19 ++++++++++++
 tools/perf/util/header.c             | 17 +++++-----
 tools/perf/util/header.h             |  7 +++--
 tools/perf/util/include/linux/list.h |  1 +
 tools/perf/util/parse-events.c       | 51 ++++++------------------------
 tools/perf/util/parse-events.h       |  7 -----
 11 files changed, 180 insertions(+), 118 deletions(-)
 create mode 100644 tools/perf/util/evlist.c
 create mode 100644 tools/perf/util/evlist.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7141c42e1469..f20bc6f85611 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -402,6 +402,7 @@ LIB_H += util/debug.h
 LIB_H += util/debugfs.h
 LIB_H += util/event.h
 LIB_H += util/evsel.h
+LIB_H += util/evlist.h
 LIB_H += util/exec_cmd.h
 LIB_H += util/types.h
 LIB_H += util/levenshtein.h
@@ -440,6 +441,7 @@ LIB_OBJS += $(OUTPUT)util/ctype.o
 LIB_OBJS += $(OUTPUT)util/debugfs.o
 LIB_OBJS += $(OUTPUT)util/environment.o
 LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evlist.o
 LIB_OBJS += $(OUTPUT)util/evsel.o
 LIB_OBJS += $(OUTPUT)util/exec_cmd.o
 LIB_OBJS += $(OUTPUT)util/help.o
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index b2f729fdb317..252ace873d32 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -18,6 +18,7 @@
 
 #include "util/header.h"
 #include "util/event.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/session.h"
@@ -66,6 +67,7 @@ static bool			sample_address			=  false;
 static bool			sample_time			=  false;
 static bool			no_buildid			=  false;
 static bool			no_buildid_cache		=  false;
+static struct perf_evlist	*evsel_list;
 
 static long			samples				=      0;
 static u64			bytes_written			=      0;
@@ -229,7 +231,8 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(struct perf_evsel *evsel, int cpu)
+static void create_counter(struct perf_evlist *evlist,
+			   struct perf_evsel *evsel, int cpu)
 {
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
@@ -263,7 +266,7 @@ static void create_counter(struct perf_evsel *evsel, int cpu)
 
 	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-	if (nr_counters > 1)
+	if (evlist->nr_entries > 1)
 		attr->sample_type |= PERF_SAMPLE_ID;
 
 	/*
@@ -410,7 +413,7 @@ try_again:
 
 		if (evsel->idx || thread_index) {
 			struct perf_evsel *first;
-			first = list_entry(evsel_list.next, struct perf_evsel, node);
+			first = list_entry(evlist->entries.next, struct perf_evsel, node);
 			ret = ioctl(FD(evsel, nr_cpu, thread_index),
 				    PERF_EVENT_IOC_SET_OUTPUT,
 				    FD(first, nr_cpu, 0));
@@ -449,14 +452,14 @@ try_again:
 		sample_type = attr->sample_type;
 }
 
-static void open_counters(int cpu)
+static void open_counters(struct perf_evlist *evlist, int cpu)
 {
 	struct perf_evsel *pos;
 
 	group_fd = -1;
 
-	list_for_each_entry(pos, &evsel_list, node)
-		create_counter(pos, cpu);
+	list_for_each_entry(pos, &evlist->entries, node)
+		create_counter(evlist, pos, cpu);
 
 	nr_cpu++;
 }
@@ -481,9 +484,9 @@ static void atexit_header(void)
 
 		if (!no_buildid)
 			process_buildids();
-		perf_header__write(&session->header, output, true);
+		perf_header__write(&session->header, evsel_list, output, true);
 		perf_session__delete(session);
-		perf_evsel_list__delete();
+		perf_evlist__delete(evsel_list);
 		symbol__exit();
 	}
 }
@@ -611,7 +614,7 @@ static int __cmd_record(int argc, const char **argv)
 			goto out_delete_session;
 	}
 
-	if (have_tracepoints(&evsel_list))
+	if (have_tracepoints(&evsel_list->entries))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 
 	/*
@@ -674,10 +677,10 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!system_wide && no_inherit && !cpu_list) {
-		open_counters(-1);
+		open_counters(evsel_list, -1);
 	} else {
 		for (i = 0; i < cpus->nr; i++)
-			open_counters(cpus->map[i]);
+			open_counters(evsel_list, cpus->map[i]);
 	}
 
 	perf_session__set_sample_type(session, sample_type);
@@ -687,7 +690,8 @@ static int __cmd_record(int argc, const char **argv)
 		if (err < 0)
 			return err;
 	} else if (file_new) {
-		err = perf_header__write(&session->header, output, false);
+		err = perf_header__write(&session->header, evsel_list,
+					 output, false);
 		if (err < 0)
 			return err;
 	}
@@ -712,7 +716,7 @@ static int __cmd_record(int argc, const char **argv)
 			return err;
 		}
 
-		if (have_tracepoints(&evsel_list)) {
+		if (have_tracepoints(&evsel_list->entries)) {
 			/*
 			 * FIXME err <= 0 here actually means that
 			 * there were no tracepoints so its not really
@@ -721,7 +725,7 @@ static int __cmd_record(int argc, const char **argv)
 			 * return this more properly and also
 			 * propagate errors that now are calling die()
 			 */
-			err = event__synthesize_tracing_data(output, &evsel_list,
+			err = event__synthesize_tracing_data(output, evsel_list,
 							     process_synthesized_event,
 							     session);
 			if (err <= 0) {
@@ -797,7 +801,7 @@ static int __cmd_record(int argc, const char **argv)
 			for (i = 0; i < nr_cpu; i++) {
 				struct perf_evsel *pos;
 
-				list_for_each_entry(pos, &evsel_list, node) {
+				list_for_each_entry(pos, &evsel_list->entries, node) {
 					for (thread = 0;
 						thread < threads->nr;
 						thread++)
@@ -838,10 +842,10 @@ static const char * const record_usage[] = {
 static bool force, append_file;
 
 const struct option record_options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
-	OPT_CALLBACK(0, "filter", NULL, "filter",
+	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
 		     "event filter", parse_filter),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing process id"),
@@ -892,6 +896,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	int err = -ENOMEM;
 	struct perf_evsel *pos;
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	argc = parse_options(argc, argv, record_options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc && target_pid == -1 && target_tid == -1 &&
@@ -913,7 +921,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (no_buildid_cache || no_buildid)
 		disable_buildid_cache();
 
-	if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) {
+	if (evsel_list->nr_entries == 0 &&
+	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		goto out_symbol_exit;
 	}
@@ -933,7 +942,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		return -1;
 	}
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a482a191a0ca..da9090245934 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,6 +43,7 @@
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 #include "util/event.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -71,6 +72,8 @@ static struct perf_event_attr default_attrs[] = {
 
 };
 
+struct perf_evlist		*evsel_list;
+
 static bool			system_wide			=  false;
 static struct cpu_map		*cpus;
 static int			run_idx				=  0;
@@ -309,7 +312,7 @@ static int run_perf_stat(int argc __used, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	list_for_each_entry(counter, &evsel_list, node) {
+	list_for_each_entry(counter, &evsel_list->entries, node) {
 		if (create_perf_stat_counter(counter) < 0) {
 			if (errno == -EPERM || errno == -EACCES) {
 				error("You may not have permission to collect %sstats.\n"
@@ -347,12 +350,12 @@ static int run_perf_stat(int argc __used, const char **argv)
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
 	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter(counter);
 			perf_evsel__close_fd(counter, cpus->nr, 1);
 		}
 	} else {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter_aggr(counter);
 			perf_evsel__close_fd(counter, cpus->nr, threads->nr);
 		}
@@ -555,10 +558,10 @@ static void print_stat(int argc, const char **argv)
 	}
 
 	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter(counter);
 	} else {
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter);
 	}
 
@@ -610,7 +613,7 @@ static int stat__set_big_num(const struct option *opt __used,
 }
 
 static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
 	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
@@ -648,6 +651,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
 	setlocale(LC_ALL, "");
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	argc = parse_options(argc, argv, options, stat_usage,
 		PARSE_OPT_STOP_AT_NON_OPTION);
 
@@ -679,17 +686,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(stat_usage, options);
 
 	/* Set attrs and nr_counters if no event is selected and !null_run */
-	if (!null_run && !nr_counters) {
+	if (!null_run && !evsel_list->nr_entries) {
 		size_t c;
 
-		nr_counters = ARRAY_SIZE(default_attrs);
-
 		for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
-			pos = perf_evsel__new(&default_attrs[c],
-					      nr_counters);
+			pos = perf_evsel__new(&default_attrs[c], c);
 			if (pos == NULL)
 				goto out;
-			list_add(&pos->node, &evsel_list);
+			perf_evlist__add(evsel_list, pos);
 		}
 	}
 
@@ -713,7 +717,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		return -1;
 	}
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
 		    perf_evsel__alloc_counts(pos, cpus->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
@@ -741,9 +745,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 	if (status != -1)
 		print_stat(argc, argv);
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list, node)
+	list_for_each_entry(pos, &evsel_list->entries, node)
 		perf_evsel__free_stat_priv(pos);
-	perf_evsel_list__delete();
+	perf_evlist__delete(evsel_list);
 out:
 	thread_map__delete(threads);
 	threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index b6998e055767..216b62ed4b89 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -21,6 +21,7 @@
 #include "perf.h"
 
 #include "util/color.h"
+#include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/session.h"
 #include "util/symbol.h"
@@ -60,6 +61,8 @@
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 
+struct perf_evlist		*evsel_list;
+
 static bool			system_wide			=  false;
 
 static int			default_interval		=      0;
@@ -267,7 +270,7 @@ static void __zero_source_counters(struct sym_entry *syme)
 
 	line = syme->src->lines;
 	while (line) {
-		for (i = 0; i < nr_counters; i++)
+		for (i = 0; i < evsel_list->nr_entries; i++)
 			line->count[i] = 0;
 		line = line->next;
 	}
@@ -414,7 +417,7 @@ static double sym_weight(const struct sym_entry *sym)
 	if (!display_weighted)
 		return weight;
 
-	for (counter = 1; counter < nr_counters-1; counter++)
+	for (counter = 1; counter < evsel_list->nr_entries - 1; counter++)
 		weight *= sym->count[counter];
 
 	weight /= (sym->count[counter] + 1);
@@ -501,7 +504,7 @@ static void print_sym_table(void)
 			rb_insert_active_sym(&tmp, syme);
 			sum_ksamples += syme->snap_count;
 
-			for (j = 0; j < nr_counters; j++)
+			for (j = 0; j < evsel_list->nr_entries; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
 		} else
 			list_remove_active_sym(syme);
@@ -535,9 +538,9 @@ static void print_sym_table(void)
 			esamples_percent);
 	}
 
-	if (nr_counters == 1 || !display_weighted) {
+	if (evsel_list->nr_entries == 1 || !display_weighted) {
 		struct perf_evsel *first;
-		first = list_entry(evsel_list.next, struct perf_evsel, node);
+		first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 		printf("%" PRIu64, (uint64_t)first->attr.sample_period);
 		if (freq)
 			printf("Hz ");
@@ -547,7 +550,7 @@ static void print_sym_table(void)
 
 	if (!display_weighted)
 		printf("%s", event_name(sym_evsel));
-	else list_for_each_entry(counter, &evsel_list, node) {
+	else list_for_each_entry(counter, &evsel_list->entries, node) {
 		if (counter->idx)
 			printf("/");
 
@@ -606,7 +609,7 @@ static void print_sym_table(void)
 			sym_width = winsize.ws_col - dso_width - 29;
 	}
 	putchar('\n');
-	if (nr_counters == 1)
+	if (evsel_list->nr_entries == 1)
 		printf("             samples  pcnt");
 	else
 		printf("   weight    samples  pcnt");
@@ -615,7 +618,7 @@ static void print_sym_table(void)
 		printf("         RIP       ");
 	printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
 	printf("   %s    _______ _____",
-	       nr_counters == 1 ? "      " : "______");
+	       evsel_list->nr_entries == 1 ? "      " : "______");
 	if (verbose)
 		printf(" ________________");
 	printf(" %-*.*s", sym_width, sym_width, graph_line);
@@ -634,7 +637,7 @@ static void print_sym_table(void)
 		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
 					 sum_ksamples));
 
-		if (nr_counters == 1 || !display_weighted)
+		if (evsel_list->nr_entries == 1 || !display_weighted)
 			printf("%20.2f ", syme->weight);
 		else
 			printf("%9.1f %10ld ", syme->weight, syme->snap_count);
@@ -744,7 +747,7 @@ static void print_mapped_keys(void)
 	fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", delay_secs);
 	fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", print_entries);
 
-	if (nr_counters > 1)
+	if (evsel_list->nr_entries > 1)
 		fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_evsel));
 
 	fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
@@ -753,7 +756,7 @@ static void print_mapped_keys(void)
 	fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
 	fprintf(stdout, "\t[S]     stop annotation.\n");
 
-	if (nr_counters > 1)
+	if (evsel_list->nr_entries > 1)
 		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
 
 	fprintf(stdout,
@@ -783,7 +786,7 @@ static int key_mapped(int c)
 			return 1;
 		case 'E':
 		case 'w':
-			return nr_counters > 1 ? 1 : 0;
+			return evsel_list->nr_entries > 1 ? 1 : 0;
 		default:
 			break;
 	}
@@ -831,22 +834,22 @@ static void handle_keypress(struct perf_session *session, int c)
 				signal(SIGWINCH, SIG_DFL);
 			break;
 		case 'E':
-			if (nr_counters > 1) {
+			if (evsel_list->nr_entries > 1) {
 				fprintf(stderr, "\nAvailable events:");
 
-				list_for_each_entry(sym_evsel, &evsel_list, node)
+				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
 					fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel));
 
 				prompt_integer(&sym_counter, "Enter details event counter");
 
-				if (sym_counter >= nr_counters) {
-					sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+				if (sym_counter >= evsel_list->nr_entries) {
+					sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 					sym_counter = 0;
 					fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel));
 					sleep(1);
 					break;
 				}
-				list_for_each_entry(sym_evsel, &evsel_list, node)
+				list_for_each_entry(sym_evsel, &evsel_list->entries, node)
 					if (sym_evsel->idx == sym_counter)
 						break;
 			} else sym_counter = 0;
@@ -1198,7 +1201,7 @@ static void perf_session__mmap_read(struct perf_session *self)
 	int i, thread_index;
 
 	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evsel_list, node) {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
 			for (thread_index = 0;
 				thread_index < threads->nr;
 				thread_index++) {
@@ -1312,7 +1315,7 @@ static int __cmd_top(void)
 
 	for (i = 0; i < cpus->nr; i++) {
 		group_fd = -1;
-		list_for_each_entry(counter, &evsel_list, node)
+		list_for_each_entry(counter, &evsel_list->entries, node)
 			start_counter(i, counter);
 	}
 
@@ -1354,7 +1357,7 @@ static const char * const top_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_CALLBACK('e', "event", NULL, "event",
+	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
@@ -1404,6 +1407,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	struct perf_evsel *pos;
 	int status = -ENOMEM;
 
+	evsel_list = perf_evlist__new();
+	if (evsel_list == NULL)
+		return -ENOMEM;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
@@ -1431,7 +1438,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		cpu_list = NULL;
 	}
 
-	if (!nr_counters && perf_evsel_list__create_default() < 0) {
+	if (!evsel_list->nr_entries &&
+	    perf_evlist__add_default(evsel_list) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		return -ENOMEM;
 	}
@@ -1459,7 +1467,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	if (cpus == NULL)
 		usage_with_options(top_usage, options);
 
-	list_for_each_entry(pos, &evsel_list, node) {
+	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
@@ -1472,10 +1480,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
+	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 
 	symbol_conf.priv_size = (sizeof(struct sym_entry) +
-				 (nr_counters + 1) * sizeof(unsigned long));
+				 (evsel_list->nr_entries + 1) * sizeof(unsigned long));
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 	if (symbol__init() < 0)
@@ -1489,9 +1497,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
 	status = __cmd_top();
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list, node)
+	list_for_each_entry(pos, &evsel_list->entries, node)
 		perf_evsel__free_mmap(pos);
-	perf_evsel_list__delete();
+	perf_evlist__delete(evsel_list);
 
 	return status;
 }
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
new file mode 100644
index 000000000000..7b4faec23737
--- /dev/null
+++ b/tools/perf/util/evlist.c
@@ -0,0 +1,53 @@
+#include "evlist.h"
+#include "evsel.h"
+#include "util.h"
+
+struct perf_evlist *perf_evlist__new(void)
+{
+	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
+
+	if (evlist != NULL) {
+		INIT_LIST_HEAD(&evlist->entries);
+	}
+
+	return evlist;
+}
+
+static void perf_evlist__purge(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &evlist->entries, node) {
+		list_del_init(&pos->node);
+		perf_evsel__delete(pos);
+	}
+
+	evlist->nr_entries = 0;
+}
+
+void perf_evlist__delete(struct perf_evlist *evlist)
+{
+	perf_evlist__purge(evlist);
+	free(evlist);
+}
+
+void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
+{
+	list_add_tail(&entry->node, &evlist->entries);
+	++evlist->nr_entries;
+}
+
+int perf_evlist__add_default(struct perf_evlist *evlist)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	struct perf_evsel *evsel = perf_evsel__new(&attr, 0);
+
+	if (evsel == NULL)
+		return -ENOMEM;
+
+	perf_evlist__add(evlist, evsel);
+	return 0;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
new file mode 100644
index 000000000000..48db91a8abf3
--- /dev/null
+++ b/tools/perf/util/evlist.h
@@ -0,0 +1,19 @@
+#ifndef __PERF_EVLIST_H
+#define __PERF_EVLIST_H 1
+
+#include <linux/list.h>
+
+struct perf_evlist {
+	struct list_head entries;
+	int		 nr_entries;
+};
+
+struct perf_evsel;
+
+struct perf_evlist *perf_evlist__new(void);
+void perf_evlist__delete(struct perf_evlist *evlist);
+
+void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
+int perf_evlist__add_default(struct perf_evlist *evlist);
+
+#endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index f6a929e74981..f0138d472339 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -8,6 +8,7 @@
 #include <linux/list.h>
 #include <linux/kernel.h>
 
+#include "evlist.h"
 #include "util.h"
 #include "header.h"
 #include "../perf.h"
@@ -428,7 +429,8 @@ static bool perf_session__read_build_ids(struct perf_session *self, bool with_hi
 	return ret;
 }
 
-static int perf_header__adds_write(struct perf_header *self, int fd)
+static int perf_header__adds_write(struct perf_header *self,
+				   struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
 	struct perf_session *session;
@@ -463,7 +465,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
 
 		/* Write trace info */
 		trace_sec->offset = lseek(fd, 0, SEEK_CUR);
-		read_tracing_data(fd, &evsel_list);
+		read_tracing_data(fd, &evlist->entries);
 		trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
 	}
 
@@ -513,7 +515,8 @@ int perf_header__write_pipe(int fd)
 	return 0;
 }
 
-int perf_header__write(struct perf_header *self, int fd, bool at_exit)
+int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
+		       int fd, bool at_exit)
 {
 	struct perf_file_header f_header;
 	struct perf_file_attr   f_attr;
@@ -566,7 +569,7 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 	self->data_offset = lseek(fd, 0, SEEK_CUR);
 
 	if (at_exit) {
-		err = perf_header__adds_write(self, fd);
+		err = perf_header__adds_write(self, evlist, fd);
 		if (err < 0)
 			return err;
 	}
@@ -1133,7 +1136,7 @@ int event__process_event_type(event_t *self,
 	return 0;
 }
 
-int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
+int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 				   event__handler_t process,
 				   struct perf_session *session __unused)
 {
@@ -1144,7 +1147,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
 	memset(&ev, 0, sizeof(ev));
 
 	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
-	size = read_tracing_data_size(fd, pattrs);
+	size = read_tracing_data_size(fd, &evlist->entries);
 	if (size <= 0)
 		return size;
 	aligned_size = ALIGN(size, sizeof(u64));
@@ -1154,7 +1157,7 @@ int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
 
 	process(&ev, NULL, session);
 
-	err = read_tracing_data(fd, pattrs);
+	err = read_tracing_data(fd, &evlist->entries);
 	write_padded(fd, NULL, 0, padding);
 
 	return aligned_size;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 33f16be7b72f..65afd7f74e0d 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -65,8 +65,11 @@ struct perf_header {
 int perf_header__init(struct perf_header *self);
 void perf_header__exit(struct perf_header *self);
 
+struct perf_evlist;
+
 int perf_header__read(struct perf_session *session, int fd);
-int perf_header__write(struct perf_header *self, int fd, bool at_exit);
+int perf_header__write(struct perf_header *self, struct perf_evlist *evlist,
+		       int fd, bool at_exit);
 int perf_header__write_pipe(int fd);
 
 int perf_header__add_attr(struct perf_header *self,
@@ -113,7 +116,7 @@ int event__synthesize_event_types(event__handler_t process,
 int event__process_event_type(event_t *self,
 			      struct perf_session *session);
 
-int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
+int event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
 				   event__handler_t process,
 				   struct perf_session *session);
 int event__process_tracing_data(event_t *self,
diff --git a/tools/perf/util/include/linux/list.h b/tools/perf/util/include/linux/list.h
index f5ca26e53fbb..356c7e467b83 100644
--- a/tools/perf/util/include/linux/list.h
+++ b/tools/perf/util/include/linux/list.h
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
 #include "../../../../include/linux/list.h"
 
 #ifndef PERF_LIST_H
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 135f69baf966..d3086cecd2dd 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,6 +1,7 @@
 #include "../../../include/linux/hw_breakpoint.h"
 #include "util.h"
 #include "../perf.h"
+#include "evlist.h"
 #include "evsel.h"
 #include "parse-options.h"
 #include "parse-events.h"
@@ -11,10 +12,6 @@
 #include "header.h"
 #include "debugfs.h"
 
-int				nr_counters;
-
-LIST_HEAD(evsel_list);
-
 struct event_symbol {
 	u8		type;
 	u64		config;
@@ -778,8 +775,9 @@ modifier:
 	return ret;
 }
 
-int parse_events(const struct option *opt __used, const char *str, int unset __used)
+int parse_events(const struct option *opt, const char *str, int unset __used)
 {
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
 	struct perf_event_attr attr;
 	enum event_result ret;
 
@@ -794,12 +792,10 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
 
 		if (ret != EVT_HANDLED_ALL) {
 			struct perf_evsel *evsel;
-			evsel = perf_evsel__new(&attr,
-						nr_counters);
+			evsel = perf_evsel__new(&attr, evlist->nr_entries);
 			if (evsel == NULL)
 				return -1;
-			list_add_tail(&evsel->node, &evsel_list);
-			++nr_counters;
+			perf_evlist__add(evlist, evsel);
 		}
 
 		if (*str == 0)
@@ -813,13 +809,14 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
 	return 0;
 }
 
-int parse_filter(const struct option *opt __used, const char *str,
+int parse_filter(const struct option *opt, const char *str,
 		 int unset __used)
 {
+	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
 	struct perf_evsel *last = NULL;
 
-	if (!list_empty(&evsel_list))
-		last = list_entry(evsel_list.prev, struct perf_evsel, node);
+	if (evlist->nr_entries > 0)
+		last = list_entry(evlist->entries.prev, struct perf_evsel, node);
 
 	if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
 		fprintf(stderr,
@@ -981,33 +978,3 @@ void print_events(void)
 
 	exit(129);
 }
-
-int perf_evsel_list__create_default(void)
-{
-	struct perf_evsel *evsel;
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_HARDWARE;
-	attr.config = PERF_COUNT_HW_CPU_CYCLES;
-
-	evsel = perf_evsel__new(&attr, 0);
-
-	if (evsel == NULL)
-		return -ENOMEM;
-
-	list_add(&evsel->node, &evsel_list);
-	++nr_counters;
-	return 0;
-}
-
-void perf_evsel_list__delete(void)
-{
-	struct perf_evsel *pos, *n;
-
-	list_for_each_entry_safe(pos, n, &evsel_list, node) {
-		list_del_init(&pos->node);
-		perf_evsel__delete(pos);
-	}
-	nr_counters = 0;
-}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 458e3ecf17af..cf7e94abb676 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -9,11 +9,6 @@
 struct list_head;
 struct perf_evsel;
 
-extern struct list_head evsel_list;
-
-int perf_evsel_list__create_default(void);
-void perf_evsel_list__delete(void);
-
 struct option;
 
 struct tracepoint_path {
@@ -25,8 +20,6 @@ struct tracepoint_path {
 extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
 extern bool have_tracepoints(struct list_head *evlist);
 
-extern int			nr_counters;
-
 const char *event_name(struct perf_evsel *event);
 extern const char *__event_name(int type, u64 config);
 
-- 
cgit v1.2.3


From 5c581041cf97aa7980b442de81ddea8273d6dcde Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 22:30:02 -0200
Subject: perf evlist: Adopt the pollfd array

Allocating just the space needed for nr_cpus * nr_threads * nr_evsels,
not the MAX_NR_CPUS and counters.

LKML-Reference: <new-submission>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 20 +++++++-------------
 tools/perf/builtin-top.c    | 26 +++++++++++---------------
 tools/perf/util/evlist.c    |  9 +++++++++
 tools/perf/util/evlist.h    |  6 ++++++
 4 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 252ace873d32..1614d89b4765 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -72,9 +72,6 @@ static struct perf_evlist	*evsel_list;
 static long			samples				=      0;
 static u64			bytes_written			=      0;
 
-static struct pollfd		*event_array;
-
-static int			nr_poll				=      0;
 static int			nr_cpu				=      0;
 
 static int			file_new			=      1;
@@ -432,9 +429,9 @@ try_again:
 				exit(-1);
 			}
 
-			event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index);
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
+			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index);
+			evlist->pollfd[evlist->nr_fds].events = POLLIN;
+			evlist->nr_fds++;
 		}
 
 		if (filter != NULL) {
@@ -793,7 +790,7 @@ static int __cmd_record(int argc, const char **argv)
 		if (hits == samples) {
 			if (done)
 				break;
-			err = poll(event_array, nr_poll, -1);
+			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
 			waking++;
 		}
 
@@ -948,9 +945,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		if (perf_header__push_event(pos->attr.config, event_name(pos)))
 			goto out_free_fd;
 	}
-	event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS *
-			      MAX_COUNTERS * threads->nr));
-	if (!event_array)
+
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
 		goto out_free_fd;
 
 	if (user_interval != ULLONG_MAX)
@@ -968,13 +964,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	} else {
 		fprintf(stderr, "frequency and count are zero, aborting\n");
 		err = -EINVAL;
-		goto out_free_event_array;
+		goto out_free_fd;
 	}
 
 	err = __cmd_record(argc, argv);
 
-out_free_event_array:
-	free(event_array);
 out_free_fd:
 	thread_map__delete(threads);
 	threads = NULL;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 216b62ed4b89..1bc465215fc6 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1193,8 +1193,6 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 	md->prev = old;
 }
 
-static struct pollfd *event_array;
-
 static void perf_session__mmap_read(struct perf_session *self)
 {
 	struct perf_evsel *counter;
@@ -1212,10 +1210,10 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-int nr_poll;
 int group_fd;
 
-static void start_counter(int i, struct perf_evsel *evsel)
+static void start_counter(int i, struct perf_evlist *evlist,
+			  struct perf_evsel *evsel)
 {
 	struct xyarray *mmap_array = evsel->priv;
 	struct mmap_data *mm;
@@ -1281,9 +1279,9 @@ try_again:
 		if (group && group_fd == -1)
 			group_fd = FD(evsel, i, thread_index);
 
-		event_array[nr_poll].fd = FD(evsel, i, thread_index);
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
+		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
+		evlist->pollfd[evlist->nr_fds].events = POLLIN;
+		evlist->nr_fds++;
 
 		mm = xyarray__entry(mmap_array, i, thread_index);
 		mm->prev = 0;
@@ -1316,11 +1314,11 @@ static int __cmd_top(void)
 	for (i = 0; i < cpus->nr; i++) {
 		group_fd = -1;
 		list_for_each_entry(counter, &evsel_list->entries, node)
-			start_counter(i, counter);
+			start_counter(i, evsel_list, counter);
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
-	poll(&event_array[0], nr_poll, 100);
+	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
 
 	perf_session__mmap_read(session);
 
@@ -1345,7 +1343,7 @@ static int __cmd_top(void)
 		perf_session__mmap_read(session);
 
 		if (hits == samples)
-			ret = poll(event_array, nr_poll, 100);
+			ret = poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
 	}
 
 	return 0;
@@ -1426,11 +1424,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 	}
 
-	event_array = malloc((sizeof(struct pollfd) *
-			      MAX_NR_CPUS * MAX_COUNTERS * threads->nr));
-	if (!event_array)
-		return -ENOMEM;
-
 	/* CPU and PID are mutually exclusive */
 	if (target_tid > 0 && cpu_list) {
 		printf("WARNING: PID switch overriding CPU\n");
@@ -1480,6 +1473,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
+		goto out_free_fd;
+
 	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
 
 	symbol_conf.priv_size = (sizeof(struct sym_entry) +
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 7b4faec23737..2abf949259d0 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1,3 +1,4 @@
+#include <poll.h>
 #include "evlist.h"
 #include "evsel.h"
 #include "util.h"
@@ -28,6 +29,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 void perf_evlist__delete(struct perf_evlist *evlist)
 {
 	perf_evlist__purge(evlist);
+	free(evlist->pollfd);
 	free(evlist);
 }
 
@@ -51,3 +53,10 @@ int perf_evlist__add_default(struct perf_evlist *evlist)
 	perf_evlist__add(evlist, evsel);
 	return 0;
 }
+
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads)
+{
+	int nfds = ncpus * nthreads * evlist->nr_entries;
+	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
+	return evlist->pollfd != NULL ? 0 : -ENOMEM;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 48db91a8abf3..a7d7e122e3c6 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -3,9 +3,13 @@
 
 #include <linux/list.h>
 
+struct pollfd;
+
 struct perf_evlist {
 	struct list_head entries;
 	int		 nr_entries;
+	int		 nr_fds;
+	struct pollfd	 *pollfd;
 };
 
 struct perf_evsel;
@@ -16,4 +20,6 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
+
 #endif /* __PERF_EVLIST_H */
-- 
cgit v1.2.3


From f08199d314458610d4ca52f8e86e0a4ec7a7bc54 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 Jan 2011 23:42:19 -0200
Subject: perf evsel: Support event groups

The perf_evsel__open now have an extra boolean argument specifying if
event grouping is desired.

The first file descriptor created on a CPU becomes the group leader.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c |  4 ++--
 tools/perf/builtin-test.c |  4 ++--
 tools/perf/util/evsel.c   | 27 +++++++++++++++++----------
 tools/perf/util/evsel.h   | 10 ++++++----
 4 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index da9090245934..b5fe522f11dc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide)
-		return perf_evsel__open_per_cpu(evsel, cpus);
+		return perf_evsel__open_per_cpu(evsel, cpus, false);
 
 	attr->inherit = !no_inherit;
 	if (target_pid == -1 && target_tid == -1) {
@@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 		attr->enable_on_exec = 1;
 	}
 
-	return perf_evsel__open_per_thread(evsel, threads);
+	return perf_evsel__open_per_thread(evsel, threads, false);
 }
 
 /*
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 5dcdba653d70..4282d671b161 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -289,7 +289,7 @@ static int test__open_syscall_event(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open_per_thread(evsel, threads) < 0) {
+	if (perf_evsel__open_per_thread(evsel, threads, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
@@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open(evsel, cpus, threads) < 0) {
+	if (perf_evsel__open(evsel, cpus, threads, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f5cfed60af98..da473ec93c75 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel,
 }
 
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-			      struct thread_map *threads)
+			      struct thread_map *threads, bool group)
 {
 	int cpu, thread;
 
@@ -137,12 +137,18 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		return -1;
 
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		int group_fd = -1;
+
 		for (thread = 0; thread < threads->nr; thread++) {
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     threads->map[thread],
-								     cpus->map[cpu], -1, 0);
+								     cpus->map[cpu],
+								     group_fd, 0);
 			if (FD(evsel, cpu, thread) < 0)
 				goto out_close;
+
+			if (group && group_fd == -1)
+				group_fd = FD(evsel, cpu, thread);
 		}
 	}
 
@@ -175,10 +181,9 @@ static struct {
 	.threads = { -1, },
 };
 
-int perf_evsel__open(struct perf_evsel *evsel,
-		     struct cpu_map *cpus, struct thread_map *threads)
+int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, bool group)
 {
-
 	if (cpus == NULL) {
 		/* Work around old compiler warnings about strict aliasing */
 		cpus = &empty_cpu_map.map;
@@ -187,15 +192,17 @@ int perf_evsel__open(struct perf_evsel *evsel,
 	if (threads == NULL)
 		threads = &empty_thread_map.map;
 
-	return __perf_evsel__open(evsel, cpus, threads);
+	return __perf_evsel__open(evsel, cpus, threads, group);
 }
 
-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
+			     struct cpu_map *cpus, bool group)
 {
-	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map);
+	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group);
 }
 
-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+int perf_evsel__open_per_thread(struct perf_evsel *evsel,
+				struct thread_map *threads, bool group)
 {
-	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads);
+	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group);
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b2d755fe88a5..0962b500cb6d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -45,10 +45,12 @@ int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
-int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus);
-int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads);
-int perf_evsel__open(struct perf_evsel *evsel, 
-		     struct cpu_map *cpus, struct thread_map *threads);
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
+			     struct cpu_map *cpus, bool group);
+int perf_evsel__open_per_thread(struct perf_evsel *evsel,
+				struct thread_map *threads, bool group);
+int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, bool group);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v1.2.3


From 9d04f1781772e11bd58806391555fc23ebb54377 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 00:08:18 -0200
Subject: perf evsel: Allow specifying if the inherit bit should be set

As this is a per-cpu attribute, we can't set it up in advance and use it
for all the calls.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c |  4 ++--
 tools/perf/builtin-test.c |  4 ++--
 tools/perf/util/evsel.c   | 16 +++++++++-------
 tools/perf/util/evsel.h   |  6 +++---
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index b5fe522f11dc..e2a2d02c5c43 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -169,7 +169,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide)
-		return perf_evsel__open_per_cpu(evsel, cpus, false);
+		return perf_evsel__open_per_cpu(evsel, cpus, false, false);
 
 	attr->inherit = !no_inherit;
 	if (target_pid == -1 && target_tid == -1) {
@@ -177,7 +177,7 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
 		attr->enable_on_exec = 1;
 	}
 
-	return perf_evsel__open_per_thread(evsel, threads, false);
+	return perf_evsel__open_per_thread(evsel, threads, false, false);
 }
 
 /*
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 4282d671b161..7287158c4830 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -289,7 +289,7 @@ static int test__open_syscall_event(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open_per_thread(evsel, threads, false) < 0) {
+	if (perf_evsel__open_per_thread(evsel, threads, false, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
@@ -364,7 +364,7 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_thread_map_delete;
 	}
 
-	if (perf_evsel__open(evsel, cpus, threads, false) < 0) {
+	if (perf_evsel__open(evsel, cpus, threads, false, false) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
 			 strerror(errno));
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index da473ec93c75..82a00536892a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -128,7 +128,7 @@ int __perf_evsel__read(struct perf_evsel *evsel,
 }
 
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-			      struct thread_map *threads, bool group)
+			      struct thread_map *threads, bool group, bool inherit)
 {
 	int cpu, thread;
 
@@ -139,6 +139,8 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 		int group_fd = -1;
 
+		evsel->attr.inherit = (cpus->map[cpu] < 0) && inherit;
+
 		for (thread = 0; thread < threads->nr; thread++) {
 			FD(evsel, cpu, thread) = sys_perf_event_open(&evsel->attr,
 								     threads->map[thread],
@@ -182,7 +184,7 @@ static struct {
 };
 
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, bool group)
+		     struct thread_map *threads, bool group, bool inherit)
 {
 	if (cpus == NULL) {
 		/* Work around old compiler warnings about strict aliasing */
@@ -192,17 +194,17 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 	if (threads == NULL)
 		threads = &empty_thread_map.map;
 
-	return __perf_evsel__open(evsel, cpus, threads, group);
+	return __perf_evsel__open(evsel, cpus, threads, group, inherit);
 }
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-			     struct cpu_map *cpus, bool group)
+			     struct cpu_map *cpus, bool group, bool inherit)
 {
-	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group);
+	return __perf_evsel__open(evsel, cpus, &empty_thread_map.map, group, inherit);
 }
 
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-				struct thread_map *threads, bool group)
+				struct thread_map *threads, bool group, bool inherit)
 {
-	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group);
+	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 0962b500cb6d..1594696bd127 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -46,11 +46,11 @@ void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
-			     struct cpu_map *cpus, bool group);
+			     struct cpu_map *cpus, bool group, bool inherit);
 int perf_evsel__open_per_thread(struct perf_evsel *evsel,
-				struct thread_map *threads, bool group);
+				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, bool group);
+		     struct thread_map *threads, bool group, bool inherit);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v1.2.3


From 72cb7013e08dec29631e0447f9496b7bacd3e14b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 10:52:47 -0200
Subject: perf top: Use perf_evsel__open

Now that it handles group_fd and inherit we can use it, sharing it with
stat.

Next step: 'perf record' should use, then move the mmap_array out of
->priv and into perf_evsel, with top and record sharing this, and at the
same time, write a 'perf test' stress test.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 92 ++++++++++++++++++++++--------------------------
 1 file changed, 42 insertions(+), 50 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1bc465215fc6..15d89bede2fb 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1210,39 +1210,50 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-int group_fd;
-
 static void start_counter(int i, struct perf_evlist *evlist,
 			  struct perf_evsel *evsel)
 {
 	struct xyarray *mmap_array = evsel->priv;
 	struct mmap_data *mm;
-	struct perf_event_attr *attr;
-	int cpu = -1;
 	int thread_index;
 
-	if (target_tid == -1)
-		cpu = cpus->map[i];
-
-	attr = &evsel->attr;
+	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
+		assert(FD(evsel, i, thread_index) >= 0);
+		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
+		evlist->pollfd[evlist->nr_fds].events = POLLIN;
+		evlist->nr_fds++;
 
-	if (freq) {
-		attr->sample_type	|= PERF_SAMPLE_PERIOD;
-		attr->freq		= 1;
-		attr->sample_freq	= freq;
+		mm = xyarray__entry(mmap_array, i, thread_index);
+		mm->prev = 0;
+		mm->mask = mmap_pages*page_size - 1;
+		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
+		if (mm->base == MAP_FAILED)
+			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
+}
+
+static void start_counters(struct perf_evlist *evlist)
+{
+	struct perf_evsel *counter;
+	int i;
 
-	attr->inherit		= (cpu < 0) && inherit;
-	attr->mmap		= 1;
+	list_for_each_entry(counter, &evlist->entries, node) {
+		struct perf_event_attr *attr = &counter->attr;
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-try_again:
-		FD(evsel, i, thread_index) = sys_perf_event_open(attr,
-				threads->map[thread_index], cpu, group_fd, 0);
+		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+
+		if (freq) {
+			attr->sample_type |= PERF_SAMPLE_PERIOD;
+			attr->freq	  = 1;
+			attr->sample_freq = freq;
+		}
 
-		if (FD(evsel, i, thread_index) < 0) {
+		attr->mmap = 1;
+try_again:
+		if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -1254,8 +1265,8 @@ try_again:
 			 * based cpu-clock-tick sw counter, which
 			 * is always available even if no PMU support:
 			 */
-			if (attr->type == PERF_TYPE_HARDWARE
-					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+			if (attr->type == PERF_TYPE_HARDWARE &&
+			    attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 				if (verbose)
 					warning(" ... trying to fall back to cpu-clock-ticks\n");
@@ -1265,39 +1276,24 @@ try_again:
 				goto try_again;
 			}
 			printf("\n");
-			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
-					FD(evsel, i, thread_index), strerror(err));
+			error("sys_perf_event_open() syscall returned with %d "
+			      "(%s).  /bin/dmesg may provide additional information.\n",
+			      err, strerror(err));
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-		assert(FD(evsel, i, thread_index) >= 0);
-		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
-
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = FD(evsel, i, thread_index);
-
-		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
-		evlist->pollfd[evlist->nr_fds].events = POLLIN;
-		evlist->nr_fds++;
+	}
 
-		mm = xyarray__entry(mmap_array, i, thread_index);
-		mm->prev = 0;
-		mm->mask = mmap_pages*page_size - 1;
-		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
-		if (mm->base == MAP_FAILED)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+	for (i = 0; i < cpus->nr; i++) {
+		list_for_each_entry(counter, &evlist->entries, node)
+			start_counter(i, evsel_list, counter);
 	}
 }
 
 static int __cmd_top(void)
 {
 	pthread_t thread;
-	struct perf_evsel *counter;
-	int i, ret;
+	int ret;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
 	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
@@ -1311,11 +1307,7 @@ static int __cmd_top(void)
 	else
 		event__synthesize_threads(event__process, session);
 
-	for (i = 0; i < cpus->nr; i++) {
-		group_fd = -1;
-		list_for_each_entry(counter, &evsel_list->entries, node)
-			start_counter(i, evsel_list, counter);
-	}
+	start_counters(evsel_list);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
-- 
cgit v1.2.3


From dd7927f4f8ee75b032ff15aeef4bda49719a443a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 14:28:51 -0200
Subject: perf record: Use perf_evsel__open

Now its time to factor out the mmap handling bits into the perf_evsel
class.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 233 +++++++++++++++++++++-----------------------
 1 file changed, 113 insertions(+), 120 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1614d89b4765..ec43f2eb7b72 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -72,8 +72,6 @@ static struct perf_evlist	*evsel_list;
 static long			samples				=      0;
 static u64			bytes_written			=      0;
 
-static int			nr_cpu				=      0;
-
 static int			file_new			=      1;
 static off_t			post_processing_offset;
 
@@ -208,8 +206,6 @@ static void sig_atexit(void)
 	kill(getpid(), signr);
 }
 
-static int group_fd;
-
 static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
 {
 	struct perf_header_attr *h_attr;
@@ -234,7 +230,6 @@ static void create_counter(struct perf_evlist *evlist,
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
 	struct perf_header_attr *h_attr;
-	int track = !evsel->idx; /* only the first counter needs these */
 	int thread_index;
 	int ret;
 	struct {
@@ -243,19 +238,77 @@ static void create_counter(struct perf_evlist *evlist,
 		u64 time_running;
 		u64 id;
 	} read_data;
-	/*
- 	 * Check if parse_single_tracepoint_event has already asked for
- 	 * PERF_SAMPLE_TIME.
- 	 *
-	 * XXX this is kludgy but short term fix for problems introduced by
-	 * eac23d1c that broke 'perf script' by having different sample_types
-	 * when using multiple tracepoint events when we use a perf binary
-	 * that tries to use sample_id_all on an older kernel.
- 	 *
- 	 * We need to move counter creation to perf_session, support
- 	 * different sample_types, etc.
- 	 */
-	bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
+
+	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
+		h_attr = get_header_attr(attr, evsel->idx);
+		if (h_attr == NULL)
+			die("nomem\n");
+
+		if (!file_new) {
+			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
+				fprintf(stderr, "incompatible append\n");
+				exit(-1);
+			}
+		}
+
+		if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
+			perror("Unable to read perf file descriptor");
+			exit(-1);
+		}
+
+		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+			pr_warning("Not enough memory to add id\n");
+			exit(-1);
+		}
+
+		assert(FD(evsel, cpu, thread_index) >= 0);
+		fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK);
+
+		if (evsel->idx || thread_index) {
+			struct perf_evsel *first;
+			first = list_entry(evlist->entries.next, struct perf_evsel, node);
+			ret = ioctl(FD(evsel, cpu, thread_index),
+				    PERF_EVENT_IOC_SET_OUTPUT,
+				    FD(first, cpu, 0));
+			if (ret) {
+				error("failed to set output: %d (%s)\n", errno,
+						strerror(errno));
+				exit(-1);
+			}
+		} else {
+			mmap_array[cpu].prev = 0;
+			mmap_array[cpu].mask = mmap_pages*page_size - 1;
+			mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0);
+			if (mmap_array[cpu].base == MAP_FAILED) {
+				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+				exit(-1);
+			}
+
+			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index);
+			evlist->pollfd[evlist->nr_fds].events = POLLIN;
+			evlist->nr_fds++;
+		}
+
+		if (filter != NULL) {
+			ret = ioctl(FD(evsel, cpu, thread_index),
+				    PERF_EVENT_IOC_SET_FILTER, filter);
+			if (ret) {
+				error("failed to set filter with %d (%s)\n", errno,
+						strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	if (!sample_type)
+		sample_type = attr->sample_type;
+}
+
+static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
+{
+	struct perf_event_attr *attr = &evsel->attr;
+	int track = !evsel->idx; /* only the first counter needs these */
 
 	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 				  PERF_FORMAT_TOTAL_TIME_RUNNING |
@@ -315,19 +368,39 @@ static void create_counter(struct perf_evlist *evlist,
 
 	attr->mmap		= track;
 	attr->comm		= track;
-	attr->inherit		= !no_inherit;
+
 	if (target_pid == -1 && target_tid == -1 && !system_wide) {
 		attr->disabled = 1;
 		attr->enable_on_exec = 1;
 	}
-retry_sample_id:
-	attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+}
 
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-try_again:
-		FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0);
+static void open_counters(struct perf_evlist *evlist)
+{
+	struct perf_evsel *pos;
+	int cpu;
+
+	list_for_each_entry(pos, &evlist->entries, node) {
+		struct perf_event_attr *attr = &pos->attr;
+		/*
+		 * Check if parse_single_tracepoint_event has already asked for
+		 * PERF_SAMPLE_TIME.
+		 *
+		 * XXX this is kludgy but short term fix for problems introduced by
+		 * eac23d1c that broke 'perf script' by having different sample_types
+		 * when using multiple tracepoint events when we use a perf binary
+		 * that tries to use sample_id_all on an older kernel.
+		 *
+		 * We need to move counter creation to perf_session, support
+		 * different sample_types, etc.
+		 */
+		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
 
-		if (FD(evsel, nr_cpu, thread_index) < 0) {
+		config_attr(pos, evlist);
+retry_sample_id:
+		attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+try_again:
+		if (perf_evsel__open(pos, cpus, threads, group, !no_inherit) < 0) {
 			int err = errno;
 
 			if (err == EPERM || err == EACCES)
@@ -364,7 +437,7 @@ try_again:
 			}
 			printf("\n");
 			error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
-			      FD(evsel, nr_cpu, thread_index), strerror(err));
+			      err, strerror(err));
 
 #if defined(__i386__) || defined(__x86_64__)
 			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
@@ -375,90 +448,13 @@ try_again:
 #endif
 
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-			exit(-1);
-		}
-
-		h_attr = get_header_attr(attr, evsel->idx);
-		if (h_attr == NULL)
-			die("nomem\n");
-
-		if (!file_new) {
-			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
-				fprintf(stderr, "incompatible append\n");
-				exit(-1);
-			}
-		}
-
-		if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
-			perror("Unable to read perf file descriptor");
-			exit(-1);
-		}
-
-		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
-			pr_warning("Not enough memory to add id\n");
-			exit(-1);
-		}
-
-		assert(FD(evsel, nr_cpu, thread_index) >= 0);
-		fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK);
-
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = FD(evsel, nr_cpu, thread_index);
-
-		if (evsel->idx || thread_index) {
-			struct perf_evsel *first;
-			first = list_entry(evlist->entries.next, struct perf_evsel, node);
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_OUTPUT,
-				    FD(first, nr_cpu, 0));
-			if (ret) {
-				error("failed to set output: %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
-		} else {
-			mmap_array[nr_cpu].prev = 0;
-			mmap_array[nr_cpu].mask = mmap_pages*page_size - 1;
-			mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0);
-			if (mmap_array[nr_cpu].base == MAP_FAILED) {
-				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-				exit(-1);
-			}
-
-			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, nr_cpu, thread_index);
-			evlist->pollfd[evlist->nr_fds].events = POLLIN;
-			evlist->nr_fds++;
-		}
-
-		if (filter != NULL) {
-			ret = ioctl(FD(evsel, nr_cpu, thread_index),
-				    PERF_EVENT_IOC_SET_FILTER, filter);
-			if (ret) {
-				error("failed to set filter with %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
 		}
 	}
 
-	if (!sample_type)
-		sample_type = attr->sample_type;
-}
-
-static void open_counters(struct perf_evlist *evlist, int cpu)
-{
-	struct perf_evsel *pos;
-
-	group_fd = -1;
-
-	list_for_each_entry(pos, &evlist->entries, node)
-		create_counter(evlist, pos, cpu);
-
-	nr_cpu++;
+	for (cpu = 0; cpu < cpus->nr; ++cpu) {
+		list_for_each_entry(pos, &evlist->entries, node)
+			create_counter(evlist, pos, cpu);
+	}
 }
 
 static int process_buildids(void)
@@ -533,7 +529,7 @@ static void mmap_read_all(void)
 {
 	int i;
 
-	for (i = 0; i < nr_cpu; i++) {
+	for (i = 0; i < cpus->nr; i++) {
 		if (mmap_array[i].base)
 			mmap_read(&mmap_array[i]);
 	}
@@ -673,12 +669,7 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if (!system_wide && no_inherit && !cpu_list) {
-		open_counters(evsel_list, -1);
-	} else {
-		for (i = 0; i < cpus->nr; i++)
-			open_counters(evsel_list, cpus->map[i]);
-	}
+	open_counters(evsel_list);
 
 	perf_session__set_sample_type(session, sample_type);
 
@@ -795,7 +786,7 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (done) {
-			for (i = 0; i < nr_cpu; i++) {
+			for (i = 0; i < cpus->nr; i++) {
 				struct perf_evsel *pos;
 
 				list_for_each_entry(pos, &evsel_list->entries, node) {
@@ -933,11 +924,13 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(record_usage, record_options);
 	}
 
-	cpus = cpu_map__new(cpu_list);
-	if (cpus == NULL) {
-		perror("failed to parse CPUs map");
-		return -1;
-	}
+	if (target_tid != -1)
+		cpus = cpu_map__dummy_new();
+	else
+		cpus = cpu_map__new(cpu_list);
+
+	if (cpus == NULL)
+		usage_with_options(record_usage, record_options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
 		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
-- 
cgit v1.2.3


From 70082dd92c4b288bd723a77897e2b555f0e63113 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:03:24 -0200
Subject: perf evsel: Introduce mmap support

Out of the code in 'perf top'. Record is next in line.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 71 ++++--------------------------------------------
 tools/perf/perf.h        | 14 ++++++++++
 tools/perf/util/evlist.c |  8 ++++++
 tools/perf/util/evlist.h |  1 +
 tools/perf/util/evsel.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/evsel.h  |  8 ++++++
 6 files changed, 107 insertions(+), 66 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 15d89bede2fb..7d723ad0bfa9 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1095,43 +1095,12 @@ static void event__process_sample(const event_t *self,
 	}
 }
 
-struct mmap_data {
-	void			*base;
-	int			mask;
-	unsigned int		prev;
-};
-
-static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel,
-					     int ncpus, int nthreads)
-{
-	evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data));
-	return evsel->priv != NULL ? 0 : -ENOMEM;
-}
-
-static void perf_evsel__free_mmap(struct perf_evsel *evsel)
-{
-	xyarray__delete(evsel->priv);
-	evsel->priv = NULL;
-}
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_event_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
 static void perf_session__mmap_read_counter(struct perf_session *self,
 					    struct perf_evsel *evsel,
 					    int cpu, int thread_idx)
 {
-	struct xyarray *mmap_array = evsel->priv;
-	struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx);
-	unsigned int head = mmap_read_head(md);
+	struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx);
+	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	struct sample_data sample;
@@ -1210,35 +1179,9 @@ static void perf_session__mmap_read(struct perf_session *self)
 	}
 }
 
-static void start_counter(int i, struct perf_evlist *evlist,
-			  struct perf_evsel *evsel)
-{
-	struct xyarray *mmap_array = evsel->priv;
-	struct mmap_data *mm;
-	int thread_index;
-
-	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
-		assert(FD(evsel, i, thread_index) >= 0);
-		fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
-
-		evlist->pollfd[evlist->nr_fds].fd = FD(evsel, i, thread_index);
-		evlist->pollfd[evlist->nr_fds].events = POLLIN;
-		evlist->nr_fds++;
-
-		mm = xyarray__entry(mmap_array, i, thread_index);
-		mm->prev = 0;
-		mm->mask = mmap_pages*page_size - 1;
-		mm->base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
-		if (mm->base == MAP_FAILED)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
-	}
-}
-
 static void start_counters(struct perf_evlist *evlist)
 {
 	struct perf_evsel *counter;
-	int i;
 
 	list_for_each_entry(counter, &evlist->entries, node) {
 		struct perf_event_attr *attr = &counter->attr;
@@ -1282,11 +1225,9 @@ try_again:
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-	}
 
-	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evlist->entries, node)
-			start_counter(i, evsel_list, counter);
+		if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0)
+			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
 }
 
@@ -1453,7 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
+		if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 ||
 		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		/*
@@ -1485,8 +1426,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
 	status = __cmd_top();
 out_free_fd:
-	list_for_each_entry(pos, &evsel_list->entries, node)
-		perf_evsel__free_mmap(pos);
 	perf_evlist__delete(evsel_list);
 
 	return status;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 95aaf565c704..5fb5e1f11d1c 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -94,6 +94,20 @@ void get_term_dimensions(struct winsize *ws);
 #include "util/types.h"
 #include <stdbool.h>
 
+struct perf_mmap {
+	void			*base;
+	int			mask;
+	unsigned int		prev;
+};
+
+static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->base;
+	int head = pc->data_head;
+	rmb();
+	return head;
+}
+
 /*
  * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
  * counters in the current task.
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 2abf949259d0..6d4129214ee8 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -60,3 +60,11 @@ int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthread
 	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
 	return evlist->pollfd != NULL ? 0 : -ENOMEM;
 }
+
+void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
+{
+	fcntl(fd, F_SETFL, O_NONBLOCK);
+	evlist->pollfd[evlist->nr_fds].fd = fd;
+	evlist->pollfd[evlist->nr_fds].events = POLLIN;
+	evlist->nr_fds++;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index a7d7e122e3c6..16bbfcba8ca8 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -21,5 +21,6 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
 
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
+void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 82a00536892a..f5006958f8da 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1,9 +1,13 @@
 #include "evsel.h"
+#include "evlist.h"
 #include "../perf.h"
 #include "util.h"
 #include "cpumap.h"
 #include "thread.h"
 
+#include <unistd.h>
+#include <sys/mman.h>
+
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
@@ -49,10 +53,32 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 		}
 }
 
+void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	struct perf_mmap *mm;
+	int cpu, thread;
+
+	for (cpu = 0; cpu < ncpus; cpu++)
+		for (thread = 0; thread < nthreads; ++thread) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			if (mm->base != NULL) {
+				munmap(mm->base, evsel->mmap_len);
+				mm->base = NULL;
+			}
+		}
+}
+
+int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
+	return evsel->mmap != NULL ? 0 : -ENOMEM;
+}
+
 void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
+	xyarray__delete(evsel->mmap);
 	free(evsel);
 }
 
@@ -208,3 +234,48 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 {
 	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
+
+int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, int pages,
+		     struct perf_evlist *evlist)
+{
+	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
+	int mask = pages * page_size - 1, cpu;
+	struct perf_mmap *mm;
+	int thread;
+
+	if (evsel->mmap == NULL &&
+	    perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0)
+		return -ENOMEM;
+
+	evsel->mmap_len = (pages + 1) * page_size;
+
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		for (thread = 0; thread < threads->nr; thread++) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			mm->prev = 0;
+			mm->mask = mask;
+			mm->base = mmap(NULL, evsel->mmap_len, PROT_READ,
+					MAP_SHARED, FD(evsel, cpu, thread), 0);
+			if (mm->base == MAP_FAILED)
+				goto out_unmap;
+
+			if (evlist != NULL)
+				 perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread));
+		}
+	}
+
+	return 0;
+
+out_unmap:
+	do {
+		while (--thread >= 0) {
+			mm = xyarray__entry(evsel->mmap, cpu, thread);
+			munmap(mm->base, evsel->mmap_len);
+			mm->base = NULL;
+		}
+		thread = threads->nr;
+	} while (--cpu >= 0);
+
+	return -1;
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 1594696bd127..c8fbef299436 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -29,19 +29,23 @@ struct perf_evsel {
 	struct perf_event_attr	attr;
 	char			*filter;
 	struct xyarray		*fd;
+	struct xyarray		*mmap;
 	struct perf_counts	*counts;
+	size_t			mmap_len;
 	int			idx;
 	void			*priv;
 };
 
 struct cpu_map;
 struct thread_map;
+struct perf_evlist;
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
+int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
@@ -51,6 +55,10 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads, bool group, bool inherit);
+int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
+		     struct thread_map *threads, int pages,
+		     struct perf_evlist *evlist);
+void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v1.2.3


From 744bd8aa3c8b43447f689a27872fa95e700b8a4f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:07:28 -0200
Subject: perf record: Use struct perf_mmap and helpers

Paving the way to using perf_evsel->mmap, do this to reduce the patch
noise in the next ones.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 25 ++++---------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ec43f2eb7b72..d89e2f106a62 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -78,26 +78,9 @@ static off_t			post_processing_offset;
 static struct perf_session	*session;
 static const char		*cpu_list;
 
-struct mmap_data {
-	void			*base;
-	unsigned int		mask;
-	unsigned int		prev;
-};
-
-static struct mmap_data		mmap_array[MAX_NR_CPUS];
-
-static unsigned long mmap_read_head(struct mmap_data *md)
-{
-	struct perf_event_mmap_page *pc = md->base;
-	long head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
+static struct perf_mmap		mmap_array[MAX_NR_CPUS];
 
-static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
+static void mmap_write_tail(struct perf_mmap *md, unsigned long tail)
 {
 	struct perf_event_mmap_page *pc = md->base;
 
@@ -136,9 +119,9 @@ static int process_synthesized_event(event_t *event,
 	return 0;
 }
 
-static void mmap_read(struct mmap_data *md)
+static void mmap_read(struct perf_mmap *md)
 {
-	unsigned int head = mmap_read_head(md);
+	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
-- 
cgit v1.2.3


From 115d2d8963a426670ac3ce983fc4c4e001703943 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 17:11:53 -0200
Subject: perf record: Move perf_mmap__write_tail to perf.h

Close to perf_mmap__read_head() and the perf_mmap struct definition.
This is useful for any recorder, and we will need it in 'perf test'.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 13 +------------
 tools/perf/perf.h           | 12 ++++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d89e2f106a62..109f3b269ac5 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -80,17 +80,6 @@ static const char		*cpu_list;
 
 static struct perf_mmap		mmap_array[MAX_NR_CPUS];
 
-static void mmap_write_tail(struct perf_mmap *md, unsigned long tail)
-{
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	/* mb(); */
-	pc->data_tail = tail;
-}
-
 static void advance_output(size_t size)
 {
 	bytes_written += size;
@@ -165,7 +154,7 @@ static void mmap_read(struct perf_mmap *md)
 	write_output(buf, size);
 
 	md->prev = old;
-	mmap_write_tail(md, old);
+	perf_mmap__write_tail(md, old);
 }
 
 static volatile int done = 0;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 5fb5e1f11d1c..a5fc660c1f12 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -108,6 +108,18 @@ static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
 	return head;
 }
 
+static inline void perf_mmap__write_tail(struct perf_mmap *md,
+					 unsigned long tail)
+{
+	struct perf_event_mmap_page *pc = md->base;
+
+	/*
+	 * ensure all reads are done before we write the tail out.
+	 */
+	/* mb(); */
+	pc->data_tail = tail;
+}
+
 /*
  * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
  * counters in the current task.
-- 
cgit v1.2.3


From 70db7533caef02350ec8d6852e589491bca3a951 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 12 Jan 2011 22:39:13 -0200
Subject: perf evlist: Move the mmap array from perf_evsel

Adopting the new model used in 'perf record', where we don't have a map
per thread per cpu, instead we have an mmap per cpu, established on the
first fd for that cpu and ask the kernel using the
PERF_EVENT_IOC_SET_OUTPUT ioctl to send events for the other fds on that
cpu for the one with the mmap.

The methods moved from perf_evsel to perf_evlist, but for easing review
they were modified in place, in evsel.c, the next patch will move the
migrated methods to evlist.c.

With this 'perf top' now uses the same mmap model used by 'perf record'
and the next patches will make 'perf record' use these new routines,
establishing a common codebase for both tools.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c |  56 ++++++++---------
 tools/perf/util/evlist.c |  27 ++++++++
 tools/perf/util/evlist.h |   9 +++
 tools/perf/util/evsel.c  | 160 ++++++++++++++++++++++++++++++++++-------------
 tools/perf/util/evsel.h  |  26 +++++---
 5 files changed, 201 insertions(+), 77 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7d723ad0bfa9..df85c1f9417b 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -78,7 +78,7 @@ static struct cpu_map		*cpus;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=     16;
+static unsigned int		mmap_pages			=    128;
 static int			freq				=   1000; /* 1 KHz */
 
 static int			delay_secs			=      2;
@@ -991,8 +991,7 @@ static int symbol_filter(struct map *map, struct symbol *sym)
 
 static void event__process_sample(const event_t *self,
 				  struct sample_data *sample,
-				  struct perf_session *session,
-				  struct perf_evsel *evsel)
+				  struct perf_session *session)
 {
 	u64 ip = self->ip.ip;
 	struct sym_entry *syme;
@@ -1085,8 +1084,12 @@ static void event__process_sample(const event_t *self,
 
 	syme = symbol__priv(al.sym);
 	if (!syme->skip) {
-		syme->count[evsel->idx]++;
+		struct perf_evsel *evsel;
+
 		syme->origin = origin;
+		evsel = perf_evlist__id2evsel(evsel_list, sample->id);
+		assert(evsel != NULL);
+		syme->count[evsel->idx]++;
 		record_precise_ip(syme, evsel->idx, ip);
 		pthread_mutex_lock(&active_symbols_lock);
 		if (list_empty(&syme->node) || !syme->node.next)
@@ -1095,11 +1098,9 @@ static void event__process_sample(const event_t *self,
 	}
 }
 
-static void perf_session__mmap_read_counter(struct perf_session *self,
-					    struct perf_evsel *evsel,
-					    int cpu, int thread_idx)
+static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
-	struct perf_mmap *md = xyarray__entry(evsel->mmap, cpu, thread_idx);
+	struct perf_mmap *md = &evsel_list->mmap[cpu];
 	unsigned int head = perf_mmap__read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
@@ -1153,7 +1154,7 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 
 		event__parse_sample(event, self, &sample);
 		if (event->header.type == PERF_RECORD_SAMPLE)
-			event__process_sample(event, &sample, self, evsel);
+			event__process_sample(event, &sample, self);
 		else
 			event__process(event, &sample, self);
 		old += size;
@@ -1164,19 +1165,10 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 
 static void perf_session__mmap_read(struct perf_session *self)
 {
-	struct perf_evsel *counter;
-	int i, thread_index;
-
-	for (i = 0; i < cpus->nr; i++) {
-		list_for_each_entry(counter, &evsel_list->entries, node) {
-			for (thread_index = 0;
-				thread_index < threads->nr;
-				thread_index++) {
-				perf_session__mmap_read_counter(self,
-					counter, i, thread_index);
-			}
-		}
-	}
+	int i;
+
+	for (i = 0; i < cpus->nr; i++)
+		perf_session__mmap_read_cpu(self, i);
 }
 
 static void start_counters(struct perf_evlist *evlist)
@@ -1194,6 +1186,11 @@ static void start_counters(struct perf_evlist *evlist)
 			attr->sample_freq = freq;
 		}
 
+		if (evlist->nr_entries > 1) {
+			attr->sample_type |= PERF_SAMPLE_ID;
+			attr->read_format |= PERF_FORMAT_ID;
+		}
+
 		attr->mmap = 1;
 try_again:
 		if (perf_evsel__open(counter, cpus, threads, group, inherit) < 0) {
@@ -1225,15 +1222,16 @@ try_again:
 			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
 			exit(-1);
 		}
-
-		if (perf_evsel__mmap(counter, cpus, threads, mmap_pages, evlist) < 0)
-			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
+
+	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, true) < 0)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 }
 
 static int __cmd_top(void)
 {
 	pthread_t thread;
+	struct perf_evsel *first;
 	int ret;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
@@ -1249,6 +1247,8 @@ static int __cmd_top(void)
 		event__synthesize_threads(event__process, session);
 
 	start_counters(evsel_list);
+	first = list_entry(evsel_list->entries.next, struct perf_evsel, node);
+	perf_session__set_sample_type(session, first->attr.sample_type);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(evsel_list->pollfd, evsel_list->nr_fds, 100);
@@ -1394,8 +1394,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(top_usage, options);
 
 	list_for_each_entry(pos, &evsel_list->entries, node) {
-		if (perf_evsel__alloc_mmap(pos, cpus->nr, threads->nr) < 0 ||
-		    perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+		if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
 			goto out_free_fd;
 		/*
 		 * Fill in the ones not specifically initialized via -c:
@@ -1406,7 +1405,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		pos->attr.sample_period = default_interval;
 	}
 
-	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0)
+	if (perf_evlist__alloc_pollfd(evsel_list, cpus->nr, threads->nr) < 0 ||
+	    perf_evlist__alloc_mmap(evsel_list, cpus->nr) < 0)
 		goto out_free_fd;
 
 	sym_evsel = list_entry(evsel_list->entries.next, struct perf_evsel, node);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 6d4129214ee8..deb82a4fc312 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -3,11 +3,18 @@
 #include "evsel.h"
 #include "util.h"
 
+#include <linux/bitops.h>
+#include <linux/hash.h>
+
 struct perf_evlist *perf_evlist__new(void)
 {
 	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
 
 	if (evlist != NULL) {
+		int i;
+
+		for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
+			INIT_HLIST_HEAD(&evlist->heads[i]);
 		INIT_LIST_HEAD(&evlist->entries);
 	}
 
@@ -29,6 +36,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 void perf_evlist__delete(struct perf_evlist *evlist)
 {
 	perf_evlist__purge(evlist);
+	free(evlist->mmap);
 	free(evlist->pollfd);
 	free(evlist);
 }
@@ -68,3 +76,22 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 	evlist->pollfd[evlist->nr_fds].events = POLLIN;
 	evlist->nr_fds++;
 }
+
+struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
+{
+	struct hlist_head *head;
+	struct hlist_node *pos;
+	struct perf_sample_id *sid;
+	int hash;
+
+	if (evlist->nr_entries == 1)
+		return list_entry(evlist->entries.next, struct perf_evsel, node);
+
+	hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
+	head = &evlist->heads[hash];
+
+	hlist_for_each_entry(sid, pos, head, node)
+		if (sid->id == id)
+			return sid->evsel;
+	return NULL;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 16bbfcba8ca8..dbfcc79bb995 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -2,13 +2,20 @@
 #define __PERF_EVLIST_H 1
 
 #include <linux/list.h>
+#include "../perf.h"
 
 struct pollfd;
 
+#define PERF_EVLIST__HLIST_BITS 8
+#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
+
 struct perf_evlist {
 	struct list_head entries;
+	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
 	int		 nr_entries;
 	int		 nr_fds;
+	int		 mmap_len;
+	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
 };
 
@@ -23,4 +30,6 @@ int perf_evlist__add_default(struct perf_evlist *evlist);
 int perf_evlist__alloc_pollfd(struct perf_evlist *evlist, int ncpus, int nthreads);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
+struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
+
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f5006958f8da..ee490356c817 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -8,7 +8,11 @@
 #include <unistd.h>
 #include <sys/mman.h>
 
+#include <linux/bitops.h>
+#include <linux/hash.h>
+
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
 {
@@ -29,6 +33,12 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 	return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	evsel->id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
+	return evsel->id != NULL ? 0 : -ENOMEM;
+}
+
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
 {
 	evsel->counts = zalloc((sizeof(*evsel->counts) +
@@ -42,6 +52,12 @@ void perf_evsel__free_fd(struct perf_evsel *evsel)
 	evsel->fd = NULL;
 }
 
+void perf_evsel__free_id(struct perf_evsel *evsel)
+{
+	xyarray__delete(evsel->id);
+	evsel->id = NULL;
+}
+
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
 	int cpu, thread;
@@ -53,32 +69,29 @@ void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 		}
 }
 
-void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus)
 {
-	struct perf_mmap *mm;
-	int cpu, thread;
+	int cpu;
 
-	for (cpu = 0; cpu < ncpus; cpu++)
-		for (thread = 0; thread < nthreads; ++thread) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			if (mm->base != NULL) {
-				munmap(mm->base, evsel->mmap_len);
-				mm->base = NULL;
-			}
+	for (cpu = 0; cpu < ncpus; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
 		}
+	}
 }
 
-int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
 {
-	evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
-	return evsel->mmap != NULL ? 0 : -ENOMEM;
+	evlist->mmap = zalloc(ncpus * sizeof(struct perf_mmap));
+	return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
 void perf_evsel__delete(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
-	xyarray__delete(evsel->mmap);
+	xyarray__delete(evsel->id);
 	free(evsel);
 }
 
@@ -235,47 +248,110 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 	return __perf_evsel__open(evsel, &empty_cpu_map.map, threads, group, inherit);
 }
 
-int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, int pages,
-		     struct perf_evlist *evlist)
+static int __perf_evlist__mmap(struct perf_evlist *evlist, int cpu, int prot,
+			       int mask, int fd)
+{
+	evlist->mmap[cpu].prev = 0;
+	evlist->mmap[cpu].mask = mask;
+	evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot,
+				      MAP_SHARED, fd, 0);
+	if (evlist->mmap[cpu].base == MAP_FAILED)
+		return -1;
+
+	perf_evlist__add_pollfd(evlist, fd);
+	return 0;
+}
+
+static int perf_evlist__id_hash(struct perf_evlist *evlist, struct perf_evsel *evsel,
+			       int cpu, int thread, int fd)
+{
+	struct perf_sample_id *sid;
+	u64 read_data[4] = { 0, };
+	int hash, id_idx = 1; /* The first entry is the counter value */
+
+	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
+	    read(fd, &read_data, sizeof(read_data)) == -1)
+		return -1;
+
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		++id_idx;
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		++id_idx;
+
+	sid = SID(evsel, cpu, thread);
+	sid->id = read_data[id_idx];
+	sid->evsel = evsel;
+	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&sid->node, &evlist->heads[hash]);
+	return 0;
+}
+
+/** perf_evlist__mmap - Create per cpu maps to receive events
+ *
+ * @evlist - list of events
+ * @cpus - cpu map being monitored
+ * @threads - threads map being monitored
+ * @pages - map length in pages
+ * @overwrite - overwrite older events?
+ *
+ * If overwrite is false the user needs to signal event consuption using:
+ *
+ *	struct perf_mmap *m = &evlist->mmap[cpu];
+ *	unsigned int head = perf_mmap__read_head(m);
+ *
+ *	perf_mmap__write_tail(m, head)
+ */
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite)
 {
 	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
 	int mask = pages * page_size - 1, cpu;
-	struct perf_mmap *mm;
-	int thread;
+	struct perf_evsel *first_evsel, *evsel;
+	int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
 
-	if (evsel->mmap == NULL &&
-	    perf_evsel__alloc_mmap(evsel, cpus->nr, threads->nr) < 0)
+	if (evlist->mmap == NULL &&
+	    perf_evlist__alloc_mmap(evlist, cpus->nr) < 0)
 		return -ENOMEM;
 
-	evsel->mmap_len = (pages + 1) * page_size;
+	if (evlist->pollfd == NULL &&
+	    perf_evlist__alloc_pollfd(evlist, cpus->nr, threads->nr) < 0)
+		return -ENOMEM;
 
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
-		for (thread = 0; thread < threads->nr; thread++) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			mm->prev = 0;
-			mm->mask = mask;
-			mm->base = mmap(NULL, evsel->mmap_len, PROT_READ,
-					MAP_SHARED, FD(evsel, cpu, thread), 0);
-			if (mm->base == MAP_FAILED)
-				goto out_unmap;
-
-			if (evlist != NULL)
-				 perf_evlist__add_pollfd(evlist, FD(evsel, cpu, thread));
+	evlist->mmap_len = (pages + 1) * page_size;
+	first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+		    evsel->id == NULL &&
+		    perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0)
+			return -ENOMEM;
+
+		for (cpu = 0; cpu < cpus->nr; cpu++) {
+			for (thread = 0; thread < threads->nr; thread++) {
+				int fd = FD(evsel, cpu, thread);
+
+				if (evsel->idx || thread) {
+					if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
+						  FD(first_evsel, cpu, 0)) != 0)
+						goto out_unmap;
+				} else if (__perf_evlist__mmap(evlist, cpu, prot, mask, fd) < 0)
+					goto out_unmap;
+
+				if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+				    perf_evlist__id_hash(evlist, evsel, cpu, thread, fd) < 0)
+					goto out_unmap;
+			}
 		}
 	}
 
 	return 0;
 
 out_unmap:
-	do {
-		while (--thread >= 0) {
-			mm = xyarray__entry(evsel->mmap, cpu, thread);
-			munmap(mm->base, evsel->mmap_len);
-			mm->base = NULL;
+	for (cpu = 0; cpu < cpus->nr; cpu++) {
+		if (evlist->mmap[cpu].base != NULL) {
+			munmap(evlist->mmap[cpu].base, evlist->mmap_len);
+			evlist->mmap[cpu].base = NULL;
 		}
-		thread = threads->nr;
-	} while (--cpu >= 0);
-
+	}
 	return -1;
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index c8fbef299436..667ee4e2e35e 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -24,14 +24,25 @@ struct perf_counts {
 	struct perf_counts_values cpu[];
 };
 
+struct perf_evsel;
+
+/*
+ * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
+ * more than one entry in the evlist.
+ */
+struct perf_sample_id {
+	struct hlist_node 	node;
+	u64		 	id;
+	struct perf_evsel	*evsel;
+};
+
 struct perf_evsel {
 	struct list_head	node;
 	struct perf_event_attr	attr;
 	char			*filter;
 	struct xyarray		*fd;
-	struct xyarray		*mmap;
+	struct xyarray		*id;
 	struct perf_counts	*counts;
-	size_t			mmap_len;
 	int			idx;
 	void			*priv;
 };
@@ -44,9 +55,11 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
-int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus);
 void perf_evsel__free_fd(struct perf_evsel *evsel);
+void perf_evsel__free_id(struct perf_evsel *evsel);
 void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 
 int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
@@ -55,10 +68,9 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads, bool group, bool inherit);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads, bool group, bool inherit);
-int perf_evsel__mmap(struct perf_evsel *evsel, struct cpu_map *cpus,
-		     struct thread_map *threads, int pages,
-		     struct perf_evlist *evlist);
-void perf_evsel__munmap(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evlist__mmap(struct perf_evlist *evlist, struct cpu_map *cpus,
+		      struct thread_map *threads, int pages, bool overwrite);
+void perf_evlist__munmap(struct perf_evlist *evlist, int ncpus);
 
 #define perf_evsel__match(evsel, t, c)		\
 	(evsel->attr.type == PERF_TYPE_##t &&	\
-- 
cgit v1.2.3


From 0a27d7f9f417c0305f7efa70631764a53c7af219 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 15:50:51 -0200
Subject: perf record: Use perf_evlist__mmap

There is more stuff that can go to the perf_ev{sel,list} layer, like
detecting if sample_id_all is available, etc, but lets try using this in
'perf test' first.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 59 +++++++++------------------------------------
 1 file changed, 11 insertions(+), 48 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 109f3b269ac5..45a3689f9ed6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -30,6 +30,7 @@
 #include <sys/mman.h>
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+#define SID(e, x, y) xyarray__entry(e->id, x, y)
 
 enum write_mode_t {
 	WRITE_FORCE,
@@ -78,8 +79,6 @@ static off_t			post_processing_offset;
 static struct perf_session	*session;
 static const char		*cpu_list;
 
-static struct perf_mmap		mmap_array[MAX_NR_CPUS];
-
 static void advance_output(size_t size)
 {
 	bytes_written += size;
@@ -196,20 +195,14 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(struct perf_evlist *evlist,
-			   struct perf_evsel *evsel, int cpu)
+static void create_counter(struct perf_evsel *evsel, int cpu)
 {
 	char *filter = evsel->filter;
 	struct perf_event_attr *attr = &evsel->attr;
 	struct perf_header_attr *h_attr;
+	struct perf_sample_id *sid;
 	int thread_index;
 	int ret;
-	struct {
-		u64 count;
-		u64 time_enabled;
-		u64 time_running;
-		u64 id;
-	} read_data;
 
 	for (thread_index = 0; thread_index < threads->nr; thread_index++) {
 		h_attr = get_header_attr(attr, evsel->idx);
@@ -223,45 +216,12 @@ static void create_counter(struct perf_evlist *evlist,
 			}
 		}
 
-		if (read(FD(evsel, cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
-			perror("Unable to read perf file descriptor");
-			exit(-1);
-		}
-
-		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+		sid = SID(evsel, cpu, thread_index);
+		if (perf_header_attr__add_id(h_attr, sid->id) < 0) {
 			pr_warning("Not enough memory to add id\n");
 			exit(-1);
 		}
 
-		assert(FD(evsel, cpu, thread_index) >= 0);
-		fcntl(FD(evsel, cpu, thread_index), F_SETFL, O_NONBLOCK);
-
-		if (evsel->idx || thread_index) {
-			struct perf_evsel *first;
-			first = list_entry(evlist->entries.next, struct perf_evsel, node);
-			ret = ioctl(FD(evsel, cpu, thread_index),
-				    PERF_EVENT_IOC_SET_OUTPUT,
-				    FD(first, cpu, 0));
-			if (ret) {
-				error("failed to set output: %d (%s)\n", errno,
-						strerror(errno));
-				exit(-1);
-			}
-		} else {
-			mmap_array[cpu].prev = 0;
-			mmap_array[cpu].mask = mmap_pages*page_size - 1;
-			mmap_array[cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, cpu, thread_index), 0);
-			if (mmap_array[cpu].base == MAP_FAILED) {
-				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-				exit(-1);
-			}
-
-			evlist->pollfd[evlist->nr_fds].fd = FD(evsel, cpu, thread_index);
-			evlist->pollfd[evlist->nr_fds].events = POLLIN;
-			evlist->nr_fds++;
-		}
-
 		if (filter != NULL) {
 			ret = ioctl(FD(evsel, cpu, thread_index),
 				    PERF_EVENT_IOC_SET_FILTER, filter);
@@ -423,9 +383,12 @@ try_again:
 		}
 	}
 
+	if (perf_evlist__mmap(evlist, cpus, threads, mmap_pages, false) < 0)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+
 	for (cpu = 0; cpu < cpus->nr; ++cpu) {
 		list_for_each_entry(pos, &evlist->entries, node)
-			create_counter(evlist, pos, cpu);
+			create_counter(pos, cpu);
 	}
 }
 
@@ -502,8 +465,8 @@ static void mmap_read_all(void)
 	int i;
 
 	for (i = 0; i < cpus->nr; i++) {
-		if (mmap_array[i].base)
-			mmap_read(&mmap_array[i]);
+		if (evsel_list->mmap[i].base)
+			mmap_read(&evsel_list->mmap[i]);
 	}
 
 	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
-- 
cgit v1.2.3


From 915fce20ecf8f7ff4189d0fff42b62aebf6a57cc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 16:19:12 -0200
Subject: perf tools: Add missing cpu_map__delete()

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cpumap.c | 5 +++++
 tools/perf/util/cpumap.h | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 3ccaa1043383..6893eec693ab 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -177,3 +177,8 @@ struct cpu_map *cpu_map__dummy_new(void)
 
 	return cpus;
 }
+
+void cpu_map__delete(struct cpu_map *map)
+{
+	free(map);
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index f7a4f42f6307..072c0a374794 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -8,6 +8,6 @@ struct cpu_map {
 
 struct cpu_map *cpu_map__new(const char *cpu_list);
 struct cpu_map *cpu_map__dummy_new(void);
-void *cpu_map__delete(struct cpu_map *map);
+void cpu_map__delete(struct cpu_map *map);
 
 #endif /* __PERF_CPUMAP_H */
-- 
cgit v1.2.3


From d2af9687c96f3864178de1860e6d83873aeef224 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Jan 2011 16:24:49 -0200
Subject: perf test: Check counts on all cpus in
 test__open_syscall_event_on_all_cpus

We were bailing out after the first count mismatch, do it in all to see
if only some CPUs are not getting the expected number of events.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-test.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 7287158c4830..7cc6b2086947 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -408,6 +408,8 @@ static int test__open_syscall_event_on_all_cpus(void)
 		goto out_close_fd;
 	}
 
+	err = 0;
+
 	for (cpu = 0; cpu < cpus->nr; ++cpu) {
 		unsigned int expected;
 
@@ -416,18 +418,18 @@ static int test__open_syscall_event_on_all_cpus(void)
 
 		if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) {
 			pr_debug("perf_evsel__open_read_on_cpu\n");
-			goto out_close_fd;
+			err = -1;
+			break;
 		}
 
 		expected = nr_open_calls + cpu;
 		if (evsel->counts->cpu[cpu].val != expected) {
 			pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
 				 expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
-			goto out_close_fd;
+			err = -1;
 		}
 	}
 
-	err = 0;
 out_close_fd:
 	perf_evsel__close_fd(evsel, 1, threads->nr);
 out_evsel_delete:
-- 
cgit v1.2.3


From 98d77b78504a423fca911a26a17bee00ef2fdda2 Mon Sep 17 00:00:00 2001
From: Han Pingtian <phan@redhat.com>
Date: Sat, 15 Jan 2011 07:00:50 +0800
Subject: perf test: check if cpu_map__new() return NULL

It looks like we should check if cpus is NULL after

	cpus = cpu_map__new(NULL);

in test__open_syscall_event_on_all_cpus().

LKML-Reference: <20110114230050.GA7011@localhost>
Signed-off-by: Han Pingtian <phan@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-test.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 7cc6b2086947..5a50e4755e6c 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -347,9 +347,9 @@ static int test__open_syscall_event_on_all_cpus(void)
 	}
 
 	cpus = cpu_map__new(NULL);
-	if (threads == NULL) {
-		pr_debug("thread_map__new\n");
-		return -1;
+	if (cpus == NULL) {
+		pr_debug("cpu_map__new\n");
+		goto out_thread_map_delete;
 	}
 
 
-- 
cgit v1.2.3


From 04391debc3e1195222a4dbb162ace6542dd89c1c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2011 10:40:59 -0200
Subject: perf evlist: Steal mmap reading routine from 'perf top'

Will be used in the upcoming 'perf test' entry for the evlist mmap
routines.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 56 +++----------------------------------------
 tools/perf/util/evlist.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/evlist.h |  4 ++++
 3 files changed, 69 insertions(+), 53 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index df85c1f9417b..58352ad807c7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1100,67 +1100,17 @@ static void event__process_sample(const event_t *self,
 
 static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 {
-	struct perf_mmap *md = &evsel_list->mmap[cpu];
-	unsigned int head = perf_mmap__read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
 	struct sample_data sample;
-	int diff;
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and mess up the samples under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	for (; old != head;) {
-		event_t *event = (event_t *)&data[old & md->mask];
-
-		event_t event_copy;
-
-		size_t size = event->header.size;
-
-		/*
-		 * Event straddles the mmap boundary -- header should always
-		 * be inside due to u64 alignment of output.
-		 */
-		if ((old & md->mask) + size != ((old + size) & md->mask)) {
-			unsigned int offset = old;
-			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = &event_copy;
-
-			do {
-				cpy = min(md->mask + 1 - (offset & md->mask), len);
-				memcpy(dst, &data[offset & md->mask], cpy);
-				offset += cpy;
-				dst += cpy;
-				len -= cpy;
-			} while (len);
-
-			event = &event_copy;
-		}
+	event_t *event;
 
+	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
 		event__parse_sample(event, self, &sample);
+
 		if (event->header.type == PERF_RECORD_SAMPLE)
 			event__process_sample(event, &sample, self);
 		else
 			event__process(event, &sample, self);
-		old += size;
 	}
-
-	md->prev = old;
 }
 
 static void perf_session__mmap_read(struct perf_session *self)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index deb82a4fc312..4b3b84cd71a1 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -95,3 +95,65 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id)
 			return sid->evsel;
 	return NULL;
 }
+
+event_t *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu)
+{
+	/* XXX Move this to perf.c, making it generally available */
+	unsigned int page_size = sysconf(_SC_PAGE_SIZE);
+	struct perf_mmap *md = &evlist->mmap[cpu];
+	unsigned int head = perf_mmap__read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	event_t *event = NULL;
+	int diff;
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and mess up the samples under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	if (old != head) {
+		size_t size;
+
+		event = (event_t *)&data[old & md->mask];
+		size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &evlist->event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &evlist->event_copy;
+		}
+
+		old += size;
+	}
+
+	md->prev = old;
+	return event;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index dbfcc79bb995..28712063db97 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include "../perf.h"
+#include "event.h"
 
 struct pollfd;
 
@@ -15,6 +16,7 @@ struct perf_evlist {
 	int		 nr_entries;
 	int		 nr_fds;
 	int		 mmap_len;
+	event_t		 event_copy;
 	struct perf_mmap *mmap;
 	struct pollfd	 *pollfd;
 };
@@ -32,4 +34,6 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
 
+event_t *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu);
+
 #endif /* __PERF_EVLIST_H */
-- 
cgit v1.2.3


From de5fa3a8a05cd60f59622e88cfeb90416760d78e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 Jan 2011 10:42:46 -0200
Subject: perf test: Add test for the evlist mmap routines

This test will generate random numbers of calls to some getpid syscalls,
then establish an mmap for a group of events that are created to monitor
these syscalls.

It will receive the events, using mmap, use its PERF_SAMPLE_ID generated
sample.id field to map back to its respective perf_evsel instance.

Then it checks if the number of syscalls reported as perf events by the
kernel corresponds to the number of syscalls made.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-test.c | 171 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 5a50e4755e6c..4fd34537c01d 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -7,7 +7,9 @@
 
 #include "util/cache.h"
 #include "util/debug.h"
+#include "util/evlist.h"
 #include "util/parse-options.h"
+#include "util/parse-events.h"
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
@@ -238,14 +240,14 @@ out:
 #include "util/evsel.h"
 #include <sys/types.h>
 
-static int trace_event__id(const char *event_name)
+static int trace_event__id(const char *evname)
 {
 	char *filename;
 	int err = -1, fd;
 
 	if (asprintf(&filename,
 		     "/sys/kernel/debug/tracing/events/syscalls/%s/id",
-		     event_name) < 0)
+		     evname) < 0)
 		return -1;
 
 	fd = open(filename, O_RDONLY);
@@ -439,6 +441,167 @@ out_thread_map_delete:
 	return err;
 }
 
+/*
+ * This test will generate random numbers of calls to some getpid syscalls,
+ * then establish an mmap for a group of events that are created to monitor
+ * the syscalls.
+ *
+ * It will receive the events, using mmap, use its PERF_SAMPLE_ID generated
+ * sample.id field to map back to its respective perf_evsel instance.
+ *
+ * Then it checks if the number of syscalls reported as perf events by
+ * the kernel corresponds to the number of syscalls made.
+ */
+static int test__basic_mmap(void)
+{
+	int err = -1;
+	event_t *event;
+	struct thread_map *threads;
+	struct perf_session session;
+	struct cpu_map *cpus;
+	struct perf_evlist *evlist;
+	struct perf_event_attr attr = {
+		.type		= PERF_TYPE_TRACEPOINT,
+		.read_format	= PERF_FORMAT_ID,
+		.sample_type	= PERF_SAMPLE_ID,
+		.watermark	= 0,
+	};
+	cpu_set_t cpu_set;
+	const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
+					"getpgid", };
+	pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
+				      (void*)getpgid };
+#define nsyscalls ARRAY_SIZE(syscall_names)
+	int ids[nsyscalls];
+	unsigned int nr_events[nsyscalls],
+		     expected_nr_events[nsyscalls], i, j;
+	struct perf_evsel *evsels[nsyscalls], *evsel;
+
+	for (i = 0; i < nsyscalls; ++i) {
+		char name[64];
+
+		snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]);
+		ids[i] = trace_event__id(name);
+		if (ids[i] < 0) {
+			pr_debug("Is debugfs mounted on /sys/kernel/debug?\n");
+			return -1;
+		}
+		nr_events[i] = 0;
+		expected_nr_events[i] = random() % 257;
+	}
+
+	threads = thread_map__new(-1, getpid());
+	if (threads == NULL) {
+		pr_debug("thread_map__new\n");
+		return -1;
+	}
+
+	cpus = cpu_map__new(NULL);
+	if (threads == NULL) {
+		pr_debug("thread_map__new\n");
+		goto out_free_threads;
+	}
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(cpus->map[0], &cpu_set);
+	sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
+	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
+		pr_debug("sched_setaffinity() failed on CPU %d: %s ",
+			 cpus->map[0], strerror(errno));
+		goto out_free_cpus;
+	}
+
+	evlist = perf_evlist__new();
+	if (threads == NULL) {
+		pr_debug("perf_evlist__new\n");
+		goto out_free_cpus;
+	}
+
+	/* anonymous union fields, can't be initialized above */
+	attr.wakeup_events = 1;
+	attr.sample_period = 1;
+
+	/*
+ 	 * FIXME: use evsel->attr.sample_type in event__parse_sample.
+ 	 * 	  This will nicely remove the requirement that we have
+ 	 * 	  all the events with the same sample_type.
+ 	 */
+	session.sample_type = attr.sample_type;
+
+	for (i = 0; i < nsyscalls; ++i) {
+		attr.config = ids[i];
+		evsels[i] = perf_evsel__new(&attr, i);
+		if (evsels[i] == NULL) {
+			pr_debug("perf_evsel__new\n");
+			goto out_free_evlist;
+		}
+
+		perf_evlist__add(evlist, evsels[i]);
+
+		if (perf_evsel__open(evsels[i], cpus, threads, false, false) < 0) {
+			pr_debug("failed to open counter: %s, "
+				 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+				 strerror(errno));
+			goto out_close_fd;
+		}
+	}
+
+	if (perf_evlist__mmap(evlist, cpus, threads, 128, true) < 0) {
+		pr_debug("failed to mmap events: %d (%s)\n", errno,
+			 strerror(errno));
+		goto out_close_fd;
+	}
+
+	for (i = 0; i < nsyscalls; ++i)
+		for (j = 0; j < expected_nr_events[i]; ++j) {
+			int foo = syscalls[i]();
+			++foo;
+		}
+
+	while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) {
+		struct sample_data sample;
+
+		if (event->header.type != PERF_RECORD_SAMPLE) {
+			pr_debug("unexpected %s event\n",
+				 event__get_event_name(event->header.type));
+			goto out_munmap;
+		}
+
+		event__parse_sample(event, &session, &sample);
+		evsel = perf_evlist__id2evsel(evlist, sample.id);
+		if (evsel == NULL) {
+			pr_debug("event with id %" PRIu64
+				 " doesn't map to an evsel\n", sample.id);
+			goto out_munmap;
+		}
+		nr_events[evsel->idx]++;
+	}
+
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) {
+			pr_debug("expected %d %s events, got %d\n",
+				 expected_nr_events[evsel->idx],
+				 event_name(evsel), nr_events[evsel->idx]);
+			goto out_munmap;
+		}
+	}
+
+	err = 0;
+out_munmap:
+	perf_evlist__munmap(evlist, 1);
+out_close_fd:
+	for (i = 0; i < nsyscalls; ++i)
+		perf_evsel__close_fd(evsels[i], 1, threads->nr);
+out_free_evlist:
+	perf_evlist__delete(evlist);
+out_free_cpus:
+	cpu_map__delete(cpus);
+out_free_threads:
+	thread_map__delete(threads);
+	return err;
+#undef nsyscalls
+}
+
 static struct test {
 	const char *desc;
 	int (*func)(void);
@@ -455,6 +618,10 @@ static struct test {
 		.desc = "detect open syscall event on all cpus",
 		.func = test__open_syscall_event_on_all_cpus,
 	},
+	{
+		.desc = "read samples using the mmap interface",
+		.func = test__basic_mmap,
+	},
 	{
 		.func = NULL,
 	},
-- 
cgit v1.2.3


From 1b3a0e9592ebf174af934b3908a2bf6a6fa86169 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:51:58 +0100
Subject: perf callchain: Feed callchains into a cursor

The callchains are fed with an array of a fixed size.
As a result we iterate over each callchains three times:

- 1st to resolve symbols
- 2nd to filter out context boundaries
- 3rd for the insertion into the tree

This also involves some pairs of memory allocation/deallocation
everytime we insert a callchain, for the filtered out array of
addresses and for the array of symbols that comes along.

Instead, feed the callchains through a linked list with persistent
allocations. It brings several pros like:

- Merge the 1st and 2nd iterations in one. That was possible before
but in a way that would involve allocating an array slightly taller
than necessary because we don't know in advance the number of context
boundaries to filter out.

- Much lesser allocations/deallocations. The linked list keeps
persistent empty entries for the next usages and is extendable at
will.

- Makes it easier for multiple sources of callchains to feed a
stacktrace together. This is deemed to pave the way for cfi based
callchains wherein traditional frame pointer based kernel
stacktraces will precede cfi based user ones, producing an overall
callchain which size is hardly predictable. This requirement
makes the static array obsolete and makes a linked list based
iterator a much more flexible fit.

Basic testing on a big perf file containing callchains (~ 176 MB)
has shown a throughput gain of about 11% with perf report.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  25 +++---
 tools/perf/util/callchain.c | 204 ++++++++++++++++++++++----------------------
 tools/perf/util/callchain.h |  66 +++++++++++++-
 tools/perf/util/hist.c      |  13 ++-
 tools/perf/util/hist.h      |   2 +
 tools/perf/util/session.c   |  22 ++---
 tools/perf/util/session.h   |  11 +--
 7 files changed, 204 insertions(+), 139 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c27e31f289e6..c95599a82f9e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -81,18 +81,17 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 					struct addr_location *al,
 					struct sample_data *data)
 {
-	struct map_symbol *syms = NULL;
 	struct symbol *parent = NULL;
-	int err = -ENOMEM;
+	int err = 0;
 	struct hist_entry *he;
 	struct hists *hists;
 	struct perf_event_attr *attr;
 
 	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
-		syms = perf_session__resolve_callchain(self, al->thread,
-						       data->callchain, &parent);
-		if (syms == NULL)
-			return -ENOMEM;
+		err = perf_session__resolve_callchain(self, al->thread,
+						      data->callchain, &parent);
+		if (err)
+			return err;
 	}
 
 	attr = perf_header__find_attr(data->id, &self->header);
@@ -101,16 +100,17 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	else
 		hists = perf_session__hists_findnew(self, data->id, 0, 0);
 	if (hists == NULL)
-		goto out_free_syms;
+		return -ENOMEM;
+
 	he = __hists__add_entry(hists, al, parent, data->period);
 	if (he == NULL)
-		goto out_free_syms;
-	err = 0;
+		return -ENOMEM;
+
 	if (symbol_conf.use_callchain) {
-		err = callchain_append(he->callchain, data->callchain, syms,
+		err = callchain_append(he->callchain, &self->callchain_cursor,
 				       data->period);
 		if (err)
-			goto out_free_syms;
+			return err;
 	}
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
@@ -119,8 +119,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	 */
 	if (use_browser > 0)
 		err = hist_entry__inc_addr_samples(he, al->addr);
-out_free_syms:
-	free(syms);
+
 	return err;
 }
 
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index e12d539417b2..53a49e0cfc6c 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009-2010, Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2009-2011, Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Handle the callchains from the stream in an ad-hoc radix tree and then
  * sort them in an rbtree.
@@ -195,26 +195,21 @@ create_child(struct callchain_node *parent, bool inherit_children)
 }
 
 
-struct resolved_ip {
-	u64		  ip;
-	struct map_symbol ms;
-};
-
-struct resolved_chain {
-	u64			nr;
-	struct resolved_ip	ips[0];
-};
-
-
 /*
  * Fill the node with callchain values
  */
 static void
-fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
+fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
 {
-	unsigned int i;
+	struct callchain_cursor_node *cursor_node;
+
+	node->val_nr = cursor->nr - cursor->pos;
+	if (!node->val_nr)
+		pr_warning("Warning: empty node in callchain tree\n");
 
-	for (i = start; i < chain->nr; i++) {
+	cursor_node = callchain_cursor_current(cursor);
+
+	while (cursor_node) {
 		struct callchain_list *call;
 
 		call = zalloc(sizeof(*call));
@@ -222,23 +217,25 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
 			perror("not enough memory for the code path tree");
 			return;
 		}
-		call->ip = chain->ips[i].ip;
-		call->ms = chain->ips[i].ms;
+		call->ip = cursor_node->ip;
+		call->ms.sym = cursor_node->sym;
+		call->ms.map = cursor_node->map;
 		list_add_tail(&call->list, &node->val);
+
+		callchain_cursor_advance(cursor);
+		cursor_node = callchain_cursor_current(cursor);
 	}
-	node->val_nr = chain->nr - start;
-	if (!node->val_nr)
-		pr_warning("Warning: empty node in callchain tree\n");
 }
 
 static void
-add_child(struct callchain_node *parent, struct resolved_chain *chain,
-	  int start, u64 period)
+add_child(struct callchain_node *parent,
+	  struct callchain_cursor *cursor,
+	  u64 period)
 {
 	struct callchain_node *new;
 
 	new = create_child(parent, false);
-	fill_node(new, chain, start);
+	fill_node(new, cursor);
 
 	new->children_hit = 0;
 	new->hit = period;
@@ -250,9 +247,10 @@ add_child(struct callchain_node *parent, struct resolved_chain *chain,
  * Then create another child to host the given callchain of new branch
  */
 static void
-split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
-		struct callchain_list *to_split, int idx_parents, int idx_local,
-		u64 period)
+split_add_child(struct callchain_node *parent,
+		struct callchain_cursor *cursor,
+		struct callchain_list *to_split,
+		u64 idx_parents, u64 idx_local, u64 period)
 {
 	struct callchain_node *new;
 	struct list_head *old_tail;
@@ -277,9 +275,9 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
 	parent->val_nr = idx_local;
 
 	/* create a new child for the new branch if any */
-	if (idx_total < chain->nr) {
+	if (idx_total < cursor->nr) {
 		parent->hit = 0;
-		add_child(parent, chain, idx_total, period);
+		add_child(parent, cursor, period);
 		parent->children_hit += period;
 	} else {
 		parent->hit = period;
@@ -287,36 +285,41 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
 }
 
 static int
-append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	     unsigned int start, u64 period);
+append_chain(struct callchain_node *root,
+	     struct callchain_cursor *cursor,
+	     u64 period);
 
 static void
-append_chain_children(struct callchain_node *root, struct resolved_chain *chain,
-		      unsigned int start, u64 period)
+append_chain_children(struct callchain_node *root,
+		      struct callchain_cursor *cursor,
+		      u64 period)
 {
 	struct callchain_node *rnode;
 
 	/* lookup in childrens */
 	chain_for_each_child(rnode, root) {
-		unsigned int ret = append_chain(rnode, chain, start, period);
+		unsigned int ret = append_chain(rnode, cursor, period);
 
 		if (!ret)
 			goto inc_children_hit;
 	}
 	/* nothing in children, add to the current node */
-	add_child(root, chain, start, period);
+	add_child(root, cursor, period);
 
 inc_children_hit:
 	root->children_hit += period;
 }
 
 static int
-append_chain(struct callchain_node *root, struct resolved_chain *chain,
-	     unsigned int start, u64 period)
+append_chain(struct callchain_node *root,
+	     struct callchain_cursor *cursor,
+	     u64 period)
 {
+	struct callchain_cursor_node *curr_snap = cursor->curr;
 	struct callchain_list *cnode;
-	unsigned int i = start;
+	u64 start = cursor->pos;
 	bool found = false;
+	u64 matches;
 
 	/*
 	 * Lookup in the current node
@@ -324,114 +327,95 @@ append_chain(struct callchain_node *root, struct resolved_chain *chain,
 	 * anywhere inside a function.
 	 */
 	list_for_each_entry(cnode, &root->val, list) {
+		struct callchain_cursor_node *node;
 		struct symbol *sym;
 
-		if (i == chain->nr)
+		node = callchain_cursor_current(cursor);
+		if (!node)
 			break;
 
-		sym = chain->ips[i].ms.sym;
+		sym = node->sym;
 
 		if (cnode->ms.sym && sym) {
 			if (cnode->ms.sym->start != sym->start)
 				break;
-		} else if (cnode->ip != chain->ips[i].ip)
+		} else if (cnode->ip != node->ip)
 			break;
 
 		if (!found)
 			found = true;
-		i++;
+
+		callchain_cursor_advance(cursor);
 	}
 
 	/* matches not, relay on the parent */
-	if (!found)
+	if (!found) {
+		cursor->curr = curr_snap;
+		cursor->pos = start;
 		return -1;
+	}
+
+	matches = cursor->pos - start;
 
 	/* we match only a part of the node. Split it and add the new chain */
-	if (i - start < root->val_nr) {
-		split_add_child(root, chain, cnode, start, i - start, period);
+	if (matches < root->val_nr) {
+		split_add_child(root, cursor, cnode, start, matches, period);
 		return 0;
 	}
 
 	/* we match 100% of the path, increment the hit */
-	if (i - start == root->val_nr && i == chain->nr) {
+	if (matches == root->val_nr && cursor->pos == cursor->nr) {
 		root->hit += period;
 		return 0;
 	}
 
 	/* We match the node and still have a part remaining */
-	append_chain_children(root, chain, i, period);
+	append_chain_children(root, cursor, period);
 
 	return 0;
 }
 
-static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
-			   struct map_symbol *syms)
-{
-	int i, j = 0;
-
-	for (i = 0; i < (int)old->nr; i++) {
-		if (old->ips[i] >= PERF_CONTEXT_MAX)
-			continue;
-
-		new->ips[j].ip = old->ips[i];
-		new->ips[j].ms = syms[i];
-		j++;
-	}
-
-	new->nr = j;
-}
-
-
-int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
-		     struct map_symbol *syms, u64 period)
+int callchain_append(struct callchain_root *root,
+		     struct callchain_cursor *cursor,
+		     u64 period)
 {
-	struct resolved_chain *filtered;
-
-	if (!chain->nr)
+	if (!cursor->nr)
 		return 0;
 
-	filtered = zalloc(sizeof(*filtered) +
-			  chain->nr * sizeof(struct resolved_ip));
-	if (!filtered)
-		return -ENOMEM;
-
-	filter_context(chain, filtered, syms);
-
-	if (!filtered->nr)
-		goto end;
+	callchain_cursor_commit(cursor);
 
-	append_chain_children(&root->node, filtered, 0, period);
+	append_chain_children(&root->node, cursor, period);
 
-	if (filtered->nr > root->max_depth)
-		root->max_depth = filtered->nr;
-end:
-	free(filtered);
+	if (cursor->nr > root->max_depth)
+		root->max_depth = cursor->nr;
 
 	return 0;
 }
 
 static int
-merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
-		   struct resolved_chain *chain)
+merge_chain_branch(struct callchain_cursor *cursor,
+		   struct callchain_node *dst, struct callchain_node *src)
 {
+	struct callchain_cursor_node **old_last = cursor->last;
 	struct callchain_node *child, *next_child;
 	struct callchain_list *list, *next_list;
-	int old_pos = chain->nr;
+	int old_pos = cursor->nr;
 	int err = 0;
 
 	list_for_each_entry_safe(list, next_list, &src->val, list) {
-		chain->ips[chain->nr].ip = list->ip;
-		chain->ips[chain->nr].ms = list->ms;
-		chain->nr++;
+		callchain_cursor_append(cursor, list->ip,
+					list->ms.map, list->ms.sym);
 		list_del(&list->list);
 		free(list);
 	}
 
-	if (src->hit)
-		append_chain_children(dst, chain, 0, src->hit);
+	if (src->hit) {
+		callchain_cursor_commit(cursor);
+		append_chain_children(dst, cursor, src->hit);
+	}
 
 	chain_for_each_child_safe(child, next_child, src) {
-		err = merge_chain_branch(dst, child, chain);
+		err = merge_chain_branch(cursor, dst, child);
 		if (err)
 			break;
 
@@ -439,26 +423,38 @@ merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
 		free(child);
 	}
 
-	chain->nr = old_pos;
+	cursor->nr = old_pos;
+	cursor->last = old_last;
 
 	return err;
 }
 
-int callchain_merge(struct callchain_root *dst, struct callchain_root *src)
+int callchain_merge(struct callchain_cursor *cursor,
+		    struct callchain_root *dst, struct callchain_root *src)
+{
+	return merge_chain_branch(cursor, &dst->node, &src->node);
+}
+
+int callchain_cursor_append(struct callchain_cursor *cursor,
+			    u64 ip, struct map *map, struct symbol *sym)
 {
-	struct resolved_chain *chain;
-	int err;
+	struct callchain_cursor_node *node = *cursor->last;
 
-	chain = malloc(sizeof(*chain) +
-		       src->max_depth * sizeof(struct resolved_ip));
-	if (!chain)
-		return -ENOMEM;
+	if (!node) {
+		node = calloc(sizeof(*node), 1);
+		if (!node)
+			return -ENOMEM;
 
-	chain->nr = 0;
+		*cursor->last = node;
+	}
 
-	err = merge_chain_branch(&dst->node, &src->node, chain);
+	node->ip = ip;
+	node->map = map;
+	node->sym = sym;
 
-	free(chain);
+	cursor->nr++;
 
-	return err;
+	cursor->last = &node->next;
+
+	return 0;
 }
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index c15fb8c24ad2..d74a19af4a44 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -49,6 +49,27 @@ struct callchain_list {
 	struct list_head	list;
 };
 
+/*
+ * A callchain cursor is a single linked list that
+ * let one feed a callchain progressively.
+ * It keeps persitent allocated entries to minimize
+ * allocations.
+ */
+struct callchain_cursor_node {
+	u64				ip;
+	struct map			*map;
+	struct symbol			*sym;
+	struct callchain_cursor_node	*next;
+};
+
+struct callchain_cursor {
+	u64				nr;
+	struct callchain_cursor_node	*first;
+	struct callchain_cursor_node	**last;
+	u64				pos;
+	struct callchain_cursor_node	*curr;
+};
+
 static inline void callchain_init(struct callchain_root *root)
 {
 	INIT_LIST_HEAD(&root->node.brothers);
@@ -67,9 +88,48 @@ static inline u64 cumul_hits(struct callchain_node *node)
 }
 
 int register_callchain_param(struct callchain_param *param);
-int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
-		     struct map_symbol *syms, u64 period);
-int callchain_merge(struct callchain_root *dst, struct callchain_root *src);
+int callchain_append(struct callchain_root *root,
+		     struct callchain_cursor *cursor,
+		     u64 period);
+
+int callchain_merge(struct callchain_cursor *cursor,
+		    struct callchain_root *dst, struct callchain_root *src);
 
 bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
+
+/*
+ * Initialize a cursor before adding entries inside, but keep
+ * the previously allocated entries as a cache.
+ */
+static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
+{
+	cursor->nr = 0;
+	cursor->last = &cursor->first;
+}
+
+int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
+			    struct map *map, struct symbol *sym);
+
+/* Close a cursor writing session. Initialize for the reader */
+static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
+{
+	cursor->curr = cursor->first;
+	cursor->pos = 0;
+}
+
+/* Cursor reading iteration helpers */
+static inline struct callchain_cursor_node *
+callchain_cursor_current(struct callchain_cursor *cursor)
+{
+	if (cursor->pos == cursor->nr)
+		return NULL;
+
+	return cursor->curr;
+}
+
+static inline void callchain_cursor_advance(struct callchain_cursor *cursor)
+{
+	cursor->curr = cursor->curr->next;
+	cursor->pos++;
+}
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 32f4f1f2f6e4..a438a0652d23 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,7 +211,9 @@ void hist_entry__free(struct hist_entry *he)
  * collapse the histogram
  */
 
-static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
+static bool hists__collapse_insert_entry(struct hists *self,
+					 struct rb_root *root,
+					 struct hist_entry *he)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -226,8 +228,11 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 
 		if (!cmp) {
 			iter->period += he->period;
-			if (symbol_conf.use_callchain)
-				callchain_merge(iter->callchain, he->callchain);
+			if (symbol_conf.use_callchain) {
+				callchain_cursor_reset(&self->callchain_cursor);
+				callchain_merge(&self->callchain_cursor, iter->callchain,
+						he->callchain);
+			}
 			hist_entry__free(he);
 			return false;
 		}
@@ -262,7 +267,7 @@ void hists__collapse_resort(struct hists *self)
 		next = rb_next(&n->rb_node);
 
 		rb_erase(&n->rb_node, &self->entries);
-		if (collapse__insert_entry(&tmp, n))
+		if (hists__collapse_insert_entry(self, &tmp, n))
 			hists__inc_nr_entries(self, n);
 	}
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ee789856a8c9..889559b86492 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -77,6 +77,8 @@ struct hists {
 	u64			event_stream;
 	u32			type;
 	u16			col_len[HISTC_NR_COLS];
+	/* Best would be to reuse the session callchain cursor */
+	struct callchain_cursor	callchain_cursor;
 };
 
 struct hist_entry *__hists__add_entry(struct hists *self,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 105f00bfd555..b58a48a5e5a9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -242,17 +242,16 @@ static bool symbol__match_parent_regex(struct symbol *sym)
 	return 0;
 }
 
-struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
-						   struct thread *thread,
-						   struct ip_callchain *chain,
-						   struct symbol **parent)
+int perf_session__resolve_callchain(struct perf_session *self,
+				    struct thread *thread,
+				    struct ip_callchain *chain,
+				    struct symbol **parent)
 {
 	u8 cpumode = PERF_RECORD_MISC_USER;
 	unsigned int i;
-	struct map_symbol *syms = calloc(chain->nr, sizeof(*syms));
+	int err;
 
-	if (!syms)
-		return NULL;
+	callchain_cursor_reset(&self->callchain_cursor);
 
 	for (i = 0; i < chain->nr; i++) {
 		u64 ip = chain->ips[i];
@@ -281,12 +280,15 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
 				*parent = al.sym;
 			if (!symbol_conf.use_callchain)
 				break;
-			syms[i].map = al.map;
-			syms[i].sym = al.sym;
 		}
+
+		err = callchain_cursor_append(&self->callchain_cursor,
+					      ip, al.map, al.sym);
+		if (err)
+			return err;
 	}
 
-	return syms;
+	return 0;
 }
 
 static int process_event_synth_stub(event_t *event __used,
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index decd83f274fd..e815468eb888 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -51,7 +51,8 @@ struct perf_session {
 	int			cwdlen;
 	char			*cwd;
 	struct ordered_samples	ordered_samples;
-	char filename[0];
+	struct callchain_cursor	callchain_cursor;
+	char			filename[0];
 };
 
 struct perf_event_ops;
@@ -94,10 +95,10 @@ int __perf_session__process_events(struct perf_session *self,
 int perf_session__process_events(struct perf_session *self,
 				 struct perf_event_ops *event_ops);
 
-struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
-						   struct thread *thread,
-						   struct ip_callchain *chain,
-						   struct symbol **parent);
+int perf_session__resolve_callchain(struct perf_session *self,
+				    struct thread *thread,
+				    struct ip_callchain *chain,
+				    struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-- 
cgit v1.2.3


From f08c3154ac439c4b5762a40107d84e839e08fbc5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:51:59 +0100
Subject: perf callchain: Rename cumul_hits into callchain_cumul_hits

That makes the callchain API naming more consistent and
reduce potential naming clashes.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/callchain.c         | 10 +++++-----
 tools/perf/util/callchain.h         |  2 +-
 tools/perf/util/hist.c              |  2 +-
 tools/perf/util/ui/browsers/hists.c |  2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 53a49e0cfc6c..4c6360fc2d5b 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -38,14 +38,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct callchain_node *rnode;
-	u64 chain_cumul = cumul_hits(chain);
+	u64 chain_cumul = callchain_cumul_hits(chain);
 
 	while (*p) {
 		u64 rnode_cumul;
 
 		parent = *p;
 		rnode = rb_entry(parent, struct callchain_node, rb_node);
-		rnode_cumul = cumul_hits(rnode);
+		rnode_cumul = callchain_cumul_hits(rnode);
 
 		switch (mode) {
 		case CHAIN_FLAT:
@@ -104,7 +104,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_abs(child, min_hit);
-		if (cumul_hits(child) >= min_hit)
+		if (callchain_cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_ABS);
 	}
@@ -129,7 +129,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
 
 	chain_for_each_child(child, node) {
 		__sort_chain_graph_rel(child, min_percent);
-		if (cumul_hits(child) >= min_hit)
+		if (callchain_cumul_hits(child) >= min_hit)
 			rb_insert_callchain(&node->rb_root, child,
 					    CHAIN_GRAPH_REL);
 	}
@@ -270,7 +270,7 @@ split_add_child(struct callchain_node *parent,
 	/* split the hits */
 	new->hit = parent->hit;
 	new->children_hit = parent->children_hit;
-	parent->children_hit = cumul_hits(new);
+	parent->children_hit = callchain_cumul_hits(new);
 	new->val_nr = parent->val_nr - idx_local;
 	parent->val_nr = idx_local;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d74a19af4a44..07f71e3e0a71 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -82,7 +82,7 @@ static inline void callchain_init(struct callchain_root *root)
 	root->max_depth = 0;
 }
 
-static inline u64 cumul_hits(struct callchain_node *node)
+static inline u64 callchain_cumul_hits(struct callchain_node *node)
 {
 	return node->hit + node->children_hit;
 }
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index a438a0652d23..02ed318d7312 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -430,7 +430,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		u64 cumul;
 
 		child = rb_entry(node, struct callchain_node, rb_node);
-		cumul = cumul_hits(child);
+		cumul = callchain_cumul_hits(child);
 		remaining -= cumul;
 
 		/*
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 60c463c16028..86428239fa65 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -377,7 +377,7 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
 	while (node) {
 		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
 		struct rb_node *next = rb_next(node);
-		u64 cumul = cumul_hits(child);
+		u64 cumul = callchain_cumul_hits(child);
 		struct callchain_list *chain;
 		char folded_sign = ' ';
 		int first = true;
-- 
cgit v1.2.3


From 16537f1355017a285b904bfb6bf767464293e69c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:52:00 +0100
Subject: perf callchain: Rename register_callchain_param into
 callchain_register_param

To make the callchain API naming more consistent.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 4 ++--
 tools/perf/util/callchain.c | 2 +-
 tools/perf/util/callchain.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index c95599a82f9e..f6a43493d1d0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -221,7 +221,7 @@ static int perf_session__setup_sample_type(struct perf_session *self)
 	} else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
 		   !symbol_conf.use_callchain) {
 			symbol_conf.use_callchain = true;
-			if (register_callchain_param(&callchain_param) < 0) {
+			if (callchain_register_param(&callchain_param) < 0) {
 				fprintf(stderr, "Can't register callchain"
 						" params\n");
 				return -EINVAL;
@@ -423,7 +423,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
 	if (tok2)
 		callchain_param.print_limit = strtod(tok2, &endptr);
 setup:
-	if (register_callchain_param(&callchain_param) < 0) {
+	if (callchain_register_param(&callchain_param) < 0) {
 		fprintf(stderr, "Can't register callchain params\n");
 		return -1;
 	}
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 4c6360fc2d5b..5b3f09dd137d 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -143,7 +143,7 @@ sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
 	rb_root->rb_node = chain_root->node.rb_root.rb_node;
 }
 
-int register_callchain_param(struct callchain_param *param)
+int callchain_register_param(struct callchain_param *param)
 {
 	switch (param->mode) {
 	case CHAIN_GRAPH_ABS:
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 07f71e3e0a71..2bb5403010d2 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -87,7 +87,7 @@ static inline u64 callchain_cumul_hits(struct callchain_node *node)
 	return node->hit + node->children_hit;
 }
 
-int register_callchain_param(struct callchain_param *param);
+int callchain_register_param(struct callchain_param *param);
 int callchain_append(struct callchain_root *root,
 		     struct callchain_cursor *cursor,
 		     u64 period);
-- 
cgit v1.2.3


From 529363b76929beb85b81439c61063130af046a21 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 14 Jan 2011 04:52:01 +0100
Subject: perf callchain: Don't give arbitrary gender to callchain tree nodes

Some little callchain tree nodes shyly asked me if they can have
sisters.

How cute!

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294977121-5700-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/callchain.c | 8 ++++----
 tools/perf/util/callchain.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 5b3f09dd137d..f8c66d1435e0 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -26,10 +26,10 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
 }
 
 #define chain_for_each_child(child, parent)	\
-	list_for_each_entry(child, &parent->children, brothers)
+	list_for_each_entry(child, &parent->children, siblings)
 
 #define chain_for_each_child_safe(child, next, parent)	\
-	list_for_each_entry_safe(child, next, &parent->children, brothers)
+	list_for_each_entry_safe(child, next, &parent->children, siblings)
 
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
@@ -189,7 +189,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
 		chain_for_each_child(next, new)
 			next->parent = new;
 	}
-	list_add_tail(&new->brothers, &parent->children);
+	list_add_tail(&new->siblings, &parent->children);
 
 	return new;
 }
@@ -419,7 +419,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
 		if (err)
 			break;
 
-		list_del(&child->brothers);
+		list_del(&child->siblings);
 		free(child);
 	}
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 2bb5403010d2..67137256a1cd 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -16,7 +16,7 @@ enum chain_mode {
 
 struct callchain_node {
 	struct callchain_node	*parent;
-	struct list_head	brothers;
+	struct list_head	siblings;
 	struct list_head	children;
 	struct list_head	val;
 	struct rb_node		rb_node; /* to sort nodes in an rbtree */
@@ -72,7 +72,7 @@ struct callchain_cursor {
 
 static inline void callchain_init(struct callchain_root *root)
 {
-	INIT_LIST_HEAD(&root->node.brothers);
+	INIT_LIST_HEAD(&root->node.siblings);
 	INIT_LIST_HEAD(&root->node.children);
 	INIT_LIST_HEAD(&root->node.val);
 
-- 
cgit v1.2.3


From b0e8572f3b29c0760b66ba5627a6d5426c88c97d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 16 Jan 2011 17:39:15 -0200
Subject: perf top: Add native_safe_halt to skip symbols

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 58352ad807c7..31fbaf38d9c1 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -933,6 +933,7 @@ repeat:
 /* Tag samples to be skipped. */
 static const char *skip_symbols[] = {
 	"default_idle",
+	"native_safe_halt",
 	"cpu_idle",
 	"enter_idle",
 	"exit_idle",
-- 
cgit v1.2.3


From 4cc9cec636e7f78aba7f17606ac13cac07ea5787 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:45:58 +0900
Subject: perf probe: Introduce lines walker interface

Introduce die_walk_lines() for walking on the line list of given die, and use
it in line_range finder and probe point finder.

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124558.22426.48170.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: s/%ld/%zd/ for a size_t nlines var that broke f14 x86 build]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 321 +++++++++++++++++++++--------------------
 1 file changed, 167 insertions(+), 154 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ab83b6ac5d65..508c017f566a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -458,6 +458,124 @@ static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
 	return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem);
 }
 
+/* Walker on lines (Note: line number will not be sorted) */
+typedef int (* line_walk_handler_t) (const char *fname, int lineno,
+				     Dwarf_Addr addr, void *data);
+
+struct __line_walk_param {
+	line_walk_handler_t handler;
+	void *data;
+	int retval;
+};
+
+/* Walk on decl lines in given DIE */
+static int __die_walk_funclines(Dwarf_Die *sp_die,
+				line_walk_handler_t handler, void *data)
+{
+	const char *fname;
+	Dwarf_Addr addr;
+	int lineno, ret = 0;
+
+	/* Handle function declaration line */
+	fname = dwarf_decl_file(sp_die);
+	if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
+	    dwarf_entrypc(sp_die, &addr) == 0) {
+		ret = handler(fname, lineno, addr, data);
+	}
+
+	return ret;
+}
+
+static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
+{
+	struct __line_walk_param *lw = data;
+
+	lw->retval = __die_walk_funclines(sp_die, lw->handler, lw->data);
+	if (lw->retval != 0)
+		return DWARF_CB_ABORT;
+
+	return DWARF_CB_OK;
+}
+
+/*
+ * Walk on lines inside given PDIE. If the PDIE is subprogram, walk only on
+ * the lines inside the subprogram, otherwise PDIE must be a CU DIE.
+ */
+static int die_walk_lines(Dwarf_Die *pdie, line_walk_handler_t handler,
+			  void *data)
+{
+	Dwarf_Lines *lines;
+	Dwarf_Line *line;
+	Dwarf_Addr addr;
+	const char *fname;
+	int lineno, ret = 0;
+	Dwarf_Die die_mem, *cu_die;
+	size_t nlines, i;
+
+	/* Get the CU die */
+	if (dwarf_tag(pdie) == DW_TAG_subprogram)
+		cu_die = dwarf_diecu(pdie, &die_mem, NULL, NULL);
+	else
+		cu_die = pdie;
+	if (!cu_die) {
+		pr_debug2("Failed to get CU from subprogram\n");
+		return -EINVAL;
+	}
+
+	/* Get lines list in the CU */
+	if (dwarf_getsrclines(cu_die, &lines, &nlines) != 0) {
+		pr_debug2("Failed to get source lines on this CU.\n");
+		return -ENOENT;
+	}
+	pr_debug2("Get %zd lines from this CU\n", nlines);
+
+	/* Walk on the lines on lines list */
+	for (i = 0; i < nlines; i++) {
+		line = dwarf_onesrcline(lines, i);
+		if (line == NULL ||
+		    dwarf_lineno(line, &lineno) != 0 ||
+		    dwarf_lineaddr(line, &addr) != 0) {
+			pr_debug2("Failed to get line info. "
+				  "Possible error in debuginfo.\n");
+			continue;
+		}
+		/* Filter lines based on address */
+		if (pdie != cu_die)
+			/*
+			 * Address filtering
+			 * The line is included in given function, and
+			 * no inline block includes it.
+			 */
+			if (!dwarf_haspc(pdie, addr) ||
+			    die_find_inlinefunc(pdie, addr, &die_mem))
+				continue;
+		/* Get source line */
+		fname = dwarf_linesrc(line, NULL, NULL);
+
+		ret = handler(fname, lineno, addr, data);
+		if (ret != 0)
+			return ret;
+	}
+
+	/*
+	 * Dwarf lines doesn't include function declarations and inlined
+	 * subroutines. We have to check functions list or given function.
+	 */
+	if (pdie != cu_die)
+		ret = __die_walk_funclines(pdie, handler, data);
+	else {
+		struct __line_walk_param param = {
+			.handler = handler,
+			.data = data,
+			.retval = 0,
+		};
+		dwarf_getfuncs(cu_die, __die_walk_culines_cb, &param, 0);
+		ret = param.retval;
+	}
+
+	return ret;
+}
+
 struct __find_variable_param {
 	const char *name;
 	Dwarf_Addr addr;
@@ -1050,43 +1168,26 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
 	return ret;
 }
 
-/* Find probe point from its line number */
-static int find_probe_point_by_line(struct probe_finder *pf)
+static int probe_point_line_walker(const char *fname, int lineno,
+				   Dwarf_Addr addr, void *data)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	int lineno;
-	int ret = 0;
+	struct probe_finder *pf = data;
+	int ret;
 
-	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
+	if (lineno != pf->lno || strtailcmp(fname, pf->fname) != 0)
+		return 0;
 
-	for (i = 0; i < nlines && ret == 0; i++) {
-		line = dwarf_onesrcline(lines, i);
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    lineno != pf->lno)
-			continue;
+	pf->addr = addr;
+	ret = call_probe_finder(NULL, pf);
 
-		/* TODO: Get fileno from line, but how? */
-		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
-			continue;
-
-		if (dwarf_lineaddr(line, &addr) != 0) {
-			pr_warning("Failed to get the address of the line.\n");
-			return -ENOENT;
-		}
-		pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n",
-			 (int)i, lineno, (uintmax_t)addr);
-		pf->addr = addr;
+	/* Continue if no error, because the line will be in inline function */
+	return ret < 0 ?: 0;
+}
 
-		ret = call_probe_finder(NULL, pf);
-		/* Continuing, because target line might be inlined. */
-	}
-	return ret;
+/* Find probe point from its line number */
+static int find_probe_point_by_line(struct probe_finder *pf)
+{
+	return die_walk_lines(&pf->cu_die, probe_point_line_walker, pf);
 }
 
 /* Find lines which match lazy pattern */
@@ -1140,15 +1241,31 @@ out_close:
 	return nlines;
 }
 
+static int probe_point_lazy_walker(const char *fname, int lineno,
+				   Dwarf_Addr addr, void *data)
+{
+	struct probe_finder *pf = data;
+	int ret;
+
+	if (!line_list__has_line(&pf->lcache, lineno) ||
+	    strtailcmp(fname, pf->fname) != 0)
+		return 0;
+
+	pr_debug("Probe line found: line:%d addr:0x%llx\n",
+		 lineno, (unsigned long long)addr);
+	pf->addr = addr;
+	ret = call_probe_finder(NULL, pf);
+
+	/*
+	 * Continue if no error, because the lazy pattern will match
+	 * to other lines
+	 */
+	return ret < 0 ?: 0;
+}
+
 /* Find probe points from lazy pattern  */
 static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	Dwarf_Die die_mem;
-	int lineno;
 	int ret = 0;
 
 	if (list_empty(&pf->lcache)) {
@@ -1162,45 +1279,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 			return ret;
 	}
 
-	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
-
-	for (i = 0; i < nlines && ret >= 0; i++) {
-		line = dwarf_onesrcline(lines, i);
-
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    !line_list__has_line(&pf->lcache, lineno))
-			continue;
-
-		/* TODO: Get fileno from line, but how? */
-		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
-			continue;
-
-		if (dwarf_lineaddr(line, &addr) != 0) {
-			pr_debug("Failed to get the address of line %d.\n",
-				 lineno);
-			continue;
-		}
-		if (sp_die) {
-			/* Address filtering 1: does sp_die include addr? */
-			if (!dwarf_haspc(sp_die, addr))
-				continue;
-			/* Address filtering 2: No child include addr? */
-			if (die_find_inlinefunc(sp_die, addr, &die_mem))
-				continue;
-		}
-
-		pr_debug("Probe line found: line[%d]:%d addr:0x%llx\n",
-			 (int)i, lineno, (unsigned long long)addr);
-		pf->addr = addr;
-
-		ret = call_probe_finder(sp_die, pf);
-		/* Continuing, because target line might be inlined. */
-	}
-	/* TODO: deallocate lines, but how? */
-	return ret;
+	return die_walk_lines(sp_die, probe_point_lazy_walker, pf);
 }
 
 /* Callback parameter with return value */
@@ -1644,91 +1723,28 @@ static int line_range_add_line(const char *src, unsigned int lineno,
 	return line_list__add_line(&lr->line_list, lineno);
 }
 
-/* Search function declaration lines */
-static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data)
+static int line_range_walk_cb(const char *fname, int lineno,
+			      Dwarf_Addr addr __used,
+			      void *data)
 {
-	struct dwarf_callback_param *param = data;
-	struct line_finder *lf = param->data;
-	const char *src;
-	int lineno;
-
-	src = dwarf_decl_file(sp_die);
-	if (src && strtailcmp(src, lf->fname) != 0)
-		return DWARF_CB_OK;
+	struct line_finder *lf = data;
 
-	if (dwarf_decl_line(sp_die, &lineno) != 0 ||
+	if ((strtailcmp(fname, lf->fname) != 0) ||
 	    (lf->lno_s > lineno || lf->lno_e < lineno))
-		return DWARF_CB_OK;
+		return 0;
 
-	param->retval = line_range_add_line(src, lineno, lf->lr);
-	if (param->retval < 0)
-		return DWARF_CB_ABORT;
-	return DWARF_CB_OK;
-}
+	if (line_range_add_line(fname, lineno, lf->lr) < 0)
+		return -EINVAL;
 
-static int find_line_range_func_decl_lines(struct line_finder *lf)
-{
-	struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0};
-	dwarf_getfuncs(&lf->cu_die, line_range_funcdecl_cb, &param, 0);
-	return param.retval;
+	return 0;
 }
 
 /* Find line range from its line number */
 static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 {
-	Dwarf_Lines *lines;
-	Dwarf_Line *line;
-	size_t nlines, i;
-	Dwarf_Addr addr;
-	int lineno, ret = 0;
-	const char *src;
-	Dwarf_Die die_mem;
-
-	line_list__init(&lf->lr->line_list);
-	if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
-		pr_warning("No source lines found.\n");
-		return -ENOENT;
-	}
-
-	/* Search probable lines on lines list */
-	for (i = 0; i < nlines; i++) {
-		line = dwarf_onesrcline(lines, i);
-		if (dwarf_lineno(line, &lineno) != 0 ||
-		    (lf->lno_s > lineno || lf->lno_e < lineno))
-			continue;
-
-		if (sp_die) {
-			/* Address filtering 1: does sp_die include addr? */
-			if (dwarf_lineaddr(line, &addr) != 0 ||
-			    !dwarf_haspc(sp_die, addr))
-				continue;
-
-			/* Address filtering 2: No child include addr? */
-			if (die_find_inlinefunc(sp_die, addr, &die_mem))
-				continue;
-		}
-
-		/* TODO: Get fileno from line, but how? */
-		src = dwarf_linesrc(line, NULL, NULL);
-		if (strtailcmp(src, lf->fname) != 0)
-			continue;
-
-		ret = line_range_add_line(src, lineno, lf->lr);
-		if (ret < 0)
-			return ret;
-	}
+	int ret;
 
-	/*
-	 * Dwarf lines doesn't include function declarations. We have to
-	 * check functions list or given function.
-	 */
-	if (sp_die) {
-		src = dwarf_decl_file(sp_die);
-		if (src && dwarf_decl_line(sp_die, &lineno) == 0 &&
-		    (lf->lno_s <= lineno && lf->lno_e >= lineno))
-			ret = line_range_add_line(src, lineno, lf->lr);
-	} else
-		ret = find_line_range_func_decl_lines(lf);
+	ret = die_walk_lines(sp_die ?: &lf->cu_die, line_range_walk_cb, lf);
 
 	/* Update status */
 	if (ret >= 0)
@@ -1758,9 +1774,6 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 	struct line_finder *lf = param->data;
 	struct line_range *lr = lf->lr;
 
-	pr_debug("find (%llx) %s\n",
-		 (unsigned long long)dwarf_dieoffset(sp_die),
-		 dwarf_diename(sp_die));
 	if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
 	    die_compare_name(sp_die, lr->function)) {
 		lf->fname = dwarf_decl_file(sp_die);
-- 
cgit v1.2.3


From 5069ed86be3c2f28bcdf7fae1374ec0c325aafba Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:46:05 +0900
Subject: perf probe: Enable to put probe inline function call site

Enable to put probe inline function call site. This will increase line-based
probe-ability.

<Without this patch>
$ ./perf probe -L schedule:48
<schedule:48>
                pre_schedule(rq, prev);

     50         if (unlikely(!rq->nr_running))
                        idle_balance(cpu, rq);

                put_prev_task(rq, prev);
                next = pick_next_task(rq);

     56         if (likely(prev != next)) {
                        sched_info_switch(prev, next);
                        trace_sched_switch_out(prev, next);
                        perf_event_task_sched_out(prev, next);

<With this patch>
$ ./perf probe -L schedule:48
<schedule:48>
     48         pre_schedule(rq, prev);

     50         if (unlikely(!rq->nr_running))
     51                 idle_balance(cpu, rq);

     53         put_prev_task(rq, prev);
     54         next = pick_next_task(rq);

     56         if (likely(prev != next)) {
     57                 sched_info_switch(prev, next);
     58                 trace_sched_switch_out(prev, next);
     59                 perf_event_task_sched_out(prev, next);

Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124604.22426.48873.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 56 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 508c017f566a..69215bff17e9 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -280,6 +280,19 @@ static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
 	return name ? (strcmp(tname, name) == 0) : false;
 }
 
+/* Get callsite line number of inline-function instance */
+static int die_get_call_lineno(Dwarf_Die *in_die)
+{
+	Dwarf_Attribute attr;
+	Dwarf_Word ret;
+
+	if (!dwarf_attr(in_die, DW_AT_call_line, &attr))
+		return -ENOENT;
+
+	dwarf_formudata(&attr, &ret);
+	return (int)ret;
+}
+
 /* Get type die */
 static Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
 {
@@ -463,27 +476,54 @@ typedef int (* line_walk_handler_t) (const char *fname, int lineno,
 				     Dwarf_Addr addr, void *data);
 
 struct __line_walk_param {
+	const char *fname;
 	line_walk_handler_t handler;
 	void *data;
 	int retval;
 };
 
-/* Walk on decl lines in given DIE */
+static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data)
+{
+	struct __line_walk_param *lw = data;
+	Dwarf_Addr addr;
+	int lineno;
+
+	if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) {
+		lineno = die_get_call_lineno(in_die);
+		if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) {
+			lw->retval = lw->handler(lw->fname, lineno, addr,
+						 lw->data);
+			if (lw->retval != 0)
+				return DIE_FIND_CB_FOUND;
+		}
+	}
+	return DIE_FIND_CB_SIBLING;
+}
+
+/* Walk on lines of blocks included in given DIE */
 static int __die_walk_funclines(Dwarf_Die *sp_die,
 				line_walk_handler_t handler, void *data)
 {
-	const char *fname;
+	struct __line_walk_param lw = {
+		.handler = handler,
+		.data = data,
+		.retval = 0,
+	};
+	Dwarf_Die die_mem;
 	Dwarf_Addr addr;
-	int lineno, ret = 0;
+	int lineno;
 
 	/* Handle function declaration line */
-	fname = dwarf_decl_file(sp_die);
-	if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
+	lw.fname = dwarf_decl_file(sp_die);
+	if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
 	    dwarf_entrypc(sp_die, &addr) == 0) {
-		ret = handler(fname, lineno, addr, data);
+		lw.retval = handler(lw.fname, lineno, addr, data);
+		if (lw.retval != 0)
+			goto done;
 	}
-
-	return ret;
+	die_find_child(sp_die, __die_walk_funclines_cb, &lw, &die_mem);
+done:
+	return lw.retval;
 }
 
 static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
-- 
cgit v1.2.3


From e80711ca8512c8586da0c3e18e2f1caf73c88731 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 13 Jan 2011 21:46:11 +0900
Subject: perf probe: Add --funcs to show available functions in symtab

Add --funcs to show available functions in symtab.

Originally this feature came from Srikar's uprobes patches
( http://lkml.org/lkml/2010/8/27/244 )

e.g.
...
__ablkcipher_walk_complete
__absent_pages_in_range
__account_scheduler_latency
__add_pages
__alloc_pages_nodemask
__alloc_percpu
__alloc_reserved_percpu
__alloc_skb
__alloc_workqueue_key
__any_online_cpu
__ata_ehi_push_desc
...

This also supports symbols in module, e.g.

...
cleanup_module
cpuid_maxphyaddr
emulate_clts
emulate_instruction
emulate_int_real
emulate_invlpg
emulator_get_dr
emulator_set_dr
emulator_task_switch
emulator_write_emulated
emulator_write_phys
fx_init
...

Original-patch-from: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: 2nddept-manager@sdl.hitachi.co.jp
Cc: Franck Bui-Huu <fbuihuu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20110113124611.22426.10835.stgit@ltc236.sdl.hitachi.co.jp>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ committer note: Add missing elf.h for STB_GLOBAL that broke a RHEL4 build ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt |  4 ++
 tools/perf/builtin-probe.c              | 29 +++++++++++++-
 tools/perf/util/probe-event.c           | 68 ++++++++++++++++++++++++++++++++-
 tools/perf/util/probe-event.h           |  1 +
 4 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 86b797a35aa6..fcc51fe0195c 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -73,6 +73,10 @@ OPTIONS
 	(Only for --vars) Show external defined variables in addition to local
 	variables.
 
+-F::
+--funcs::
+	Show available functions in given module or kernel.
+
 -f::
 --force::
 	Forcibly add events with existing name.
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index add163c9f0e7..6cf708aba7c9 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -52,6 +52,7 @@ static struct {
 	bool show_lines;
 	bool show_vars;
 	bool show_ext_vars;
+	bool show_funcs;
 	bool mod_events;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
@@ -221,6 +222,8 @@ static const struct option options[] = {
 	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
 		 "Set how many probe points can be found for a probe."),
+	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
+		    "Show potential probe-able functions."),
 	OPT_END()
 };
 
@@ -246,7 +249,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		params.max_probe_points = MAX_PROBES;
 
 	if ((!params.nevents && !params.dellist && !params.list_events &&
-	     !params.show_lines))
+	     !params.show_lines && !params.show_funcs))
 		usage_with_options(probe_usage, options);
 
 	/*
@@ -267,12 +270,36 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			pr_err(" Error: Don't use --list with --vars.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (params.show_funcs) {
+			pr_err("  Error: Don't use --list with --funcs.\n");
+			usage_with_options(probe_usage, options);
+		}
 		ret = show_perf_probe_events();
 		if (ret < 0)
 			pr_err("  Error: Failed to show event list. (%d)\n",
 			       ret);
 		return ret;
 	}
+	if (params.show_funcs) {
+		if (params.nevents != 0 || params.dellist) {
+			pr_err("  Error: Don't use --funcs with"
+			       " --add/--del.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_lines) {
+			pr_err("  Error: Don't use --funcs with --line.\n");
+			usage_with_options(probe_usage, options);
+		}
+		if (params.show_vars) {
+			pr_err("  Error: Don't use --funcs with --vars.\n");
+			usage_with_options(probe_usage, options);
+		}
+		ret = show_available_funcs(params.target_module);
+		if (ret < 0)
+			pr_err("  Error: Failed to show functions."
+			       " (%d)\n", ret);
+		return ret;
+	}
 
 #ifdef DWARF_SUPPORT
 	if (params.show_lines) {
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 6e29d9c9dccc..859d377a3df3 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <limits.h>
+#include <elf.h>
 
 #undef _GNU_SOURCE
 #include "util.h"
@@ -111,7 +112,25 @@ static struct symbol *__find_kernel_function_by_name(const char *name,
 						     NULL);
 }
 
-const char *kernel_get_module_path(const char *module)
+static struct map *kernel_get_module_map(const char *module)
+{
+	struct rb_node *nd;
+	struct map_groups *grp = &machine.kmaps;
+
+	if (!module)
+		module = "kernel";
+
+	for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node);
+		if (strncmp(pos->dso->short_name + 1, module,
+			    pos->dso->short_name_len - 2) == 0) {
+			return pos;
+		}
+	}
+	return NULL;
+}
+
+static struct dso *kernel_get_module_dso(const char *module)
 {
 	struct dso *dso;
 	struct map *map;
@@ -141,7 +160,13 @@ const char *kernel_get_module_path(const char *module)
 		}
 	}
 found:
-	return dso->long_name;
+	return dso;
+}
+
+const char *kernel_get_module_path(const char *module)
+{
+	struct dso *dso = kernel_get_module_dso(module);
+	return (dso) ? dso->long_name : NULL;
 }
 
 #ifdef DWARF_SUPPORT
@@ -1913,3 +1938,42 @@ int del_perf_probe_events(struct strlist *dellist)
 	return ret;
 }
 
+/*
+ * If a symbol corresponds to a function with global binding return 0.
+ * For all others return 1.
+ */
+static int filter_non_global_functions(struct map *map __unused,
+					struct symbol *sym)
+{
+	if (sym->binding != STB_GLOBAL)
+		return 1;
+
+	return 0;
+}
+
+int show_available_funcs(const char *module)
+{
+	struct map *map;
+	int ret;
+
+	setup_pager();
+
+	ret = init_vmlinux();
+	if (ret < 0)
+		return ret;
+
+	map = kernel_get_module_map(module);
+	if (!map) {
+		pr_err("Failed to find %s map.\n", (module) ? : "kernel");
+		return -EINVAL;
+	}
+	if (map__load(map, filter_non_global_functions)) {
+		pr_err("Failed to load map.\n");
+		return -EINVAL;
+	}
+	if (!dso__sorted_by_name(map->dso, map->type))
+		dso__sort_by_name(map->dso, map->type);
+
+	dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
+	return 0;
+}
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 5accbedfea37..1fb4f18337d3 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -127,6 +127,7 @@ extern int show_line_range(struct line_range *lr, const char *module);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			       int max_probe_points, const char *module,
 			       bool externs);
+extern int show_available_funcs(const char *module);
 
 
 /* Maximum index number of event-name postfix */
-- 
cgit v1.2.3


From d7065adb9b4f3384c2615f0a3dbdb6c3aae1eb18 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Sun, 16 Jan 2011 17:14:45 +0100
Subject: perf record: auto detect when stdout is a pipe

This patch gives the ability to 'perf record' to detect when its stdout
has been redirected to a pipe. There's now no more need to add '-o -'
switch in this case.

However '-o <path>' option has always precedence, that is if specified
and stdout has been connected via a pipe then the output will go into
the specified output.

LKML-Reference: <m3ipxo966i.fsf@gmail.com>
Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 45a3689f9ed6..1346d4230bc0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -48,7 +48,7 @@ static unsigned int		user_freq 			= UINT_MAX;
 static int			freq				=   1000;
 static int			output;
 static int			pipe_output			=      0;
-static const char		*output_name			= "perf.data";
+static const char		*output_name			= NULL;
 static int			group				=      0;
 static int			realtime_prio			=      0;
 static bool			nodelay				=  false;
@@ -497,18 +497,26 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
-	if (!strcmp(output_name, "-"))
-		pipe_output = 1;
-	else if (!stat(output_name, &st) && st.st_size) {
-		if (write_mode == WRITE_FORCE) {
-			char oldname[PATH_MAX];
-			snprintf(oldname, sizeof(oldname), "%s.old",
-				 output_name);
-			unlink(oldname);
-			rename(output_name, oldname);
+	if (!output_name) {
+		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
+			pipe_output = 1;
+		else
+			output_name = "perf.data";
+	}
+	if (output_name) {
+		if (!strcmp(output_name, "-"))
+			pipe_output = 1;
+		else if (!stat(output_name, &st) && st.st_size) {
+			if (write_mode == WRITE_FORCE) {
+				char oldname[PATH_MAX];
+				snprintf(oldname, sizeof(oldname), "%s.old",
+					 output_name);
+				unlink(oldname);
+				rename(output_name, oldname);
+			}
+		} else if (write_mode == WRITE_APPEND) {
+			write_mode = WRITE_FORCE;
 		}
-	} else if (write_mode == WRITE_APPEND) {
-		write_mode = WRITE_FORCE;
 	}
 
 	flags = O_CREAT|O_RDWR;
-- 
cgit v1.2.3


From 17ea1b70a87e28457821318341bead2b45563092 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 Jan 2011 14:40:46 -0200
Subject: perf tools: Pass the struct opt to the wildcard parsing routine

It is needed because it will call parse_event for each tracepoint
name that matches, and we pass the perf_evlist via opt->value.

Problem introduced in 4503fdd where my assumption about opt being
always non NULL made me not look at callers of parse_events outside
builtin-*.c.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d3086cecd2dd..cf082daa43e3 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -446,8 +446,8 @@ parse_single_tracepoint_event(char *sys_name,
 /* sys + ':' + event + ':' + flags*/
 #define MAX_EVOPT_LEN	(MAX_EVENT_LENGTH * 2 + 2 + 128)
 static enum event_result
-parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
-				char *flags)
+parse_multiple_tracepoint_event(const struct option *opt, char *sys_name,
+				const char *evt_exp, char *flags)
 {
 	char evt_path[MAXPATHLEN];
 	struct dirent *evt_ent;
@@ -480,15 +480,16 @@ parse_multiple_tracepoint_event(char *sys_name, const char *evt_exp,
 		if (len < 0)
 			return EVT_FAILED;
 
-		if (parse_events(NULL, event_opt, 0))
+		if (parse_events(opt, event_opt, 0))
 			return EVT_FAILED;
 	}
 
 	return EVT_HANDLED_ALL;
 }
 
-static enum event_result parse_tracepoint_event(const char **strp,
-				    struct perf_event_attr *attr)
+static enum event_result
+parse_tracepoint_event(const struct option *opt, const char **strp,
+		       struct perf_event_attr *attr)
 {
 	const char *evt_name;
 	char *flags = NULL, *comma_loc;
@@ -527,7 +528,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
 		return EVT_FAILED;
 	if (strpbrk(evt_name, "*?")) {
 		*strp += strlen(sys_name) + evt_length + 1; /* 1 == the ':' */
-		return parse_multiple_tracepoint_event(sys_name, evt_name,
+		return parse_multiple_tracepoint_event(opt, sys_name, evt_name,
 						       flags);
 	} else {
 		return parse_single_tracepoint_event(sys_name, evt_name,
@@ -737,11 +738,12 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
  * Symbolic names are (almost) exactly matched.
  */
 static enum event_result
-parse_event_symbols(const char **str, struct perf_event_attr *attr)
+parse_event_symbols(const struct option *opt, const char **str,
+		    struct perf_event_attr *attr)
 {
 	enum event_result ret;
 
-	ret = parse_tracepoint_event(str, attr);
+	ret = parse_tracepoint_event(opt, str, attr);
 	if (ret != EVT_FAILED)
 		goto modifier;
 
@@ -783,7 +785,7 @@ int parse_events(const struct option *opt, const char *str, int unset __used)
 
 	for (;;) {
 		memset(&attr, 0, sizeof(attr));
-		ret = parse_event_symbols(&str, &attr);
+		ret = parse_event_symbols(opt, &str, &attr);
 		if (ret == EVT_FAILED)
 			return -1;
 
-- 
cgit v1.2.3


From fd78260b5376173faeb17127bd63b3c99a8e8bfb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 18 Jan 2011 15:15:24 -0200
Subject: perf threads: Move thread_map to separate file

To untangle it from struct thread handling, that is tied to symbols, etc.

Right now in the python bindings I'm working on I need just a subset of
the util/ files, untangling it allows me to do that.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile          |  2 ++
 tools/perf/builtin-record.c  |  1 +
 tools/perf/builtin-stat.c    |  1 +
 tools/perf/builtin-test.c    |  2 +-
 tools/perf/builtin-top.c     |  1 +
 tools/perf/util/evsel.c      |  2 +-
 tools/perf/util/thread.c     | 55 -------------------------------------
 tools/perf/util/thread.h     | 14 ----------
 tools/perf/util/thread_map.c | 64 ++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/thread_map.h | 15 +++++++++++
 10 files changed, 86 insertions(+), 71 deletions(-)
 create mode 100644 tools/perf/util/thread_map.c
 create mode 100644 tools/perf/util/thread_map.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index f20bc6f85611..638e8e146bb9 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -426,6 +426,7 @@ LIB_H += util/values.h
 LIB_H += util/sort.h
 LIB_H += util/hist.h
 LIB_H += util/thread.h
+LIB_H += util/thread_map.h
 LIB_H += util/trace-event.h
 LIB_H += util/probe-finder.h
 LIB_H += util/probe-event.h
@@ -471,6 +472,7 @@ LIB_OBJS += $(OUTPUT)util/map.o
 LIB_OBJS += $(OUTPUT)util/pstack.o
 LIB_OBJS += $(OUTPUT)util/session.o
 LIB_OBJS += $(OUTPUT)util/thread.o
+LIB_OBJS += $(OUTPUT)util/thread_map.o
 LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
 LIB_OBJS += $(OUTPUT)util/trace-event-read.o
 LIB_OBJS += $(OUTPUT)util/trace-event-info.o
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1346d4230bc0..d7886307f6f4 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -24,6 +24,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/cpumap.h"
+#include "util/thread_map.h"
 
 #include <unistd.h>
 #include <sched.h>
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e2a2d02c5c43..8906adfdbd8e 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -49,6 +49,7 @@
 #include "util/header.h"
 #include "util/cpumap.h"
 #include "util/thread.h"
+#include "util/thread_map.h"
 
 #include <sys/prctl.h>
 #include <math.h>
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 4fd34537c01d..dc91ee06a37c 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -12,7 +12,7 @@
 #include "util/parse-events.h"
 #include "util/session.h"
 #include "util/symbol.h"
-#include "util/thread.h"
+#include "util/thread_map.h"
 
 static long page_size;
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 31fbaf38d9c1..d0b16d905405 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -26,6 +26,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
+#include "util/thread_map.h"
 #include "util/util.h"
 #include <linux/rbtree.h>
 #include "util/parse-options.h"
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ee490356c817..9a6d94299ab8 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3,7 +3,7 @@
 #include "../perf.h"
 #include "util.h"
 #include "cpumap.h"
-#include "thread.h"
+#include "thread_map.h"
 
 #include <unistd.h>
 #include <sys/mman.h>
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 00f4eade2e3e..d5d3b22250f3 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,61 +7,6 @@
 #include "util.h"
 #include "debug.h"
 
-/* Skip "." and ".." directories */
-static int filter(const struct dirent *dir)
-{
-	if (dir->d_name[0] == '.')
-		return 0;
-	else
-		return 1;
-}
-
-struct thread_map *thread_map__new_by_pid(pid_t pid)
-{
-	struct thread_map *threads;
-	char name[256];
-	int items;
-	struct dirent **namelist = NULL;
-	int i;
-
-	sprintf(name, "/proc/%d/task", pid);
-	items = scandir(name, &namelist, filter, NULL);
-	if (items <= 0)
-                return NULL;
-
-	threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
-	if (threads != NULL) {
-		for (i = 0; i < items; i++)
-			threads->map[i] = atoi(namelist[i]->d_name);
-		threads->nr = items;
-	}
-
-	for (i=0; i<items; i++)
-		free(namelist[i]);
-	free(namelist);
-
-	return threads;
-}
-
-struct thread_map *thread_map__new_by_tid(pid_t tid)
-{
-	struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
-
-	if (threads != NULL) {
-		threads->map[0] = tid;
-		threads->nr	= 1;
-	}
-
-	return threads;
-}
-
-struct thread_map *thread_map__new(pid_t pid, pid_t tid)
-{
-	if (pid != -1)
-		return thread_map__new_by_pid(pid);
-	return thread_map__new_by_tid(tid);
-}
-
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = zalloc(sizeof(*self));
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index d7574101054a..e5f2401c1b5e 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -18,24 +18,10 @@ struct thread {
 	int			comm_len;
 };
 
-struct thread_map {
-	int nr;
-	int map[];
-};
-
 struct perf_session;
 
 void thread__delete(struct thread *self);
 
-struct thread_map *thread_map__new_by_pid(pid_t pid);
-struct thread_map *thread_map__new_by_tid(pid_t tid);
-struct thread_map *thread_map__new(pid_t pid, pid_t tid);
-
-static inline void thread_map__delete(struct thread_map *threads)
-{
-	free(threads);
-}
-
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
 struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
new file mode 100644
index 000000000000..a5df131b77c3
--- /dev/null
+++ b/tools/perf/util/thread_map.c
@@ -0,0 +1,64 @@
+#include <dirent.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "thread_map.h"
+
+/* Skip "." and ".." directories */
+static int filter(const struct dirent *dir)
+{
+	if (dir->d_name[0] == '.')
+		return 0;
+	else
+		return 1;
+}
+
+struct thread_map *thread_map__new_by_pid(pid_t pid)
+{
+	struct thread_map *threads;
+	char name[256];
+	int items;
+	struct dirent **namelist = NULL;
+	int i;
+
+	sprintf(name, "/proc/%d/task", pid);
+	items = scandir(name, &namelist, filter, NULL);
+	if (items <= 0)
+                return NULL;
+
+	threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
+	if (threads != NULL) {
+		for (i = 0; i < items; i++)
+			threads->map[i] = atoi(namelist[i]->d_name);
+		threads->nr = items;
+	}
+
+	for (i=0; i<items; i++)
+		free(namelist[i]);
+	free(namelist);
+
+	return threads;
+}
+
+struct thread_map *thread_map__new_by_tid(pid_t tid)
+{
+	struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+
+	if (threads != NULL) {
+		threads->map[0] = tid;
+		threads->nr	= 1;
+	}
+
+	return threads;
+}
+
+struct thread_map *thread_map__new(pid_t pid, pid_t tid)
+{
+	if (pid != -1)
+		return thread_map__new_by_pid(pid);
+	return thread_map__new_by_tid(tid);
+}
+
+void thread_map__delete(struct thread_map *threads)
+{
+	free(threads);
+}
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
new file mode 100644
index 000000000000..3cb907311409
--- /dev/null
+++ b/tools/perf/util/thread_map.h
@@ -0,0 +1,15 @@
+#ifndef __PERF_THREAD_MAP_H
+#define __PERF_THREAD_MAP_H
+
+#include <sys/types.h>
+
+struct thread_map {
+	int nr;
+	int map[];
+};
+
+struct thread_map *thread_map__new_by_pid(pid_t pid);
+struct thread_map *thread_map__new_by_tid(pid_t tid);
+struct thread_map *thread_map__new(pid_t pid, pid_t tid);
+void thread_map__delete(struct thread_map *threads);
+#endif	/* __PERF_THREAD_MAP_H */
-- 
cgit v1.2.3


From d0dd74e853a0a6f37e8061d6d50be41c7034c54c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 21 Jan 2011 13:46:41 -0200
Subject: perf tools: Move event__parse_sample to evsel.c

To avoid linking more stuff in the python binding I'm working on, future
csets will make the sample type be taken from the evsel itself, but for
that we need to first have one file per cpu and per sample_type, not a
single perf.data file.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-test.c |  11 +---
 tools/perf/builtin-top.c  |   2 +-
 tools/perf/util/event.c   | 125 ----------------------------------------------
 tools/perf/util/event.h   |   5 +-
 tools/perf/util/evsel.c   | 118 +++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/session.c |   4 +-
 tools/perf/util/session.h |   9 ++++
 7 files changed, 134 insertions(+), 140 deletions(-)

diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index dc91ee06a37c..231e3e21810c 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -10,7 +10,6 @@
 #include "util/evlist.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
-#include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread_map.h"
 
@@ -457,7 +456,6 @@ static int test__basic_mmap(void)
 	int err = -1;
 	event_t *event;
 	struct thread_map *threads;
-	struct perf_session session;
 	struct cpu_map *cpus;
 	struct perf_evlist *evlist;
 	struct perf_event_attr attr = {
@@ -521,13 +519,6 @@ static int test__basic_mmap(void)
 	attr.wakeup_events = 1;
 	attr.sample_period = 1;
 
-	/*
- 	 * FIXME: use evsel->attr.sample_type in event__parse_sample.
- 	 * 	  This will nicely remove the requirement that we have
- 	 * 	  all the events with the same sample_type.
- 	 */
-	session.sample_type = attr.sample_type;
-
 	for (i = 0; i < nsyscalls; ++i) {
 		attr.config = ids[i];
 		evsels[i] = perf_evsel__new(&attr, i);
@@ -567,7 +558,7 @@ static int test__basic_mmap(void)
 			goto out_munmap;
 		}
 
-		event__parse_sample(event, &session, &sample);
+		event__parse_sample(event, attr.sample_type, false, &sample);
 		evsel = perf_evlist__id2evsel(evlist, sample.id);
 		if (evsel == NULL) {
 			pr_debug("event with id %" PRIu64
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d0b16d905405..ce2e50c891c7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1106,7 +1106,7 @@ static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu)
 	event_t *event;
 
 	while ((event = perf_evlist__read_on_cpu(evsel_list, cpu)) != NULL) {
-		event__parse_sample(event, self, &sample);
+		perf_session__parse_sample(self, event, &sample);
 
 		if (event->header.type == PERF_RECORD_SAMPLE)
 			event__process_sample(event, &sample, self);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 1478ab4ee222..e4db8b888546 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -826,128 +826,3 @@ out_filtered:
 	al->filtered = true;
 	return 0;
 }
-
-static int event__parse_id_sample(const event_t *event,
-				  struct perf_session *session,
-				  struct sample_data *sample)
-{
-	const u64 *array;
-	u64 type;
-
-	sample->cpu = sample->pid = sample->tid = -1;
-	sample->stream_id = sample->id = sample->time = -1ULL;
-
-	if (!session->sample_id_all)
-		return 0;
-
-	array = event->sample.array;
-	array += ((event->header.size -
-		   sizeof(event->header)) / sizeof(u64)) - 1;
-	type = session->sample_type;
-
-	if (type & PERF_SAMPLE_CPU) {
-		u32 *p = (u32 *)array;
-		sample->cpu = *p;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_STREAM_ID) {
-		sample->stream_id = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_ID) {
-		sample->id = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_TIME) {
-		sample->time = *array;
-		array--;
-	}
-
-	if (type & PERF_SAMPLE_TID) {
-		u32 *p = (u32 *)array;
-		sample->pid = p[0];
-		sample->tid = p[1];
-	}
-
-	return 0;
-}
-
-int event__parse_sample(const event_t *event, struct perf_session *session,
-			struct sample_data *data)
-{
-	const u64 *array;
-	u64 type;
-
-	if (event->header.type != PERF_RECORD_SAMPLE)
-		return event__parse_id_sample(event, session, data);
-
-	array = event->sample.array;
-	type = session->sample_type;
-
-	if (type & PERF_SAMPLE_IP) {
-		data->ip = event->ip.ip;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TID) {
-		u32 *p = (u32 *)array;
-		data->pid = p[0];
-		data->tid = p[1];
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TIME) {
-		data->time = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_ADDR) {
-		data->addr = *array;
-		array++;
-	}
-
-	data->id = -1ULL;
-	if (type & PERF_SAMPLE_ID) {
-		data->id = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_STREAM_ID) {
-		data->stream_id = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_CPU) {
-		u32 *p = (u32 *)array;
-		data->cpu = *p;
-		array++;
-	} else
-		data->cpu = -1;
-
-	if (type & PERF_SAMPLE_PERIOD) {
-		data->period = *array;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_READ) {
-		pr_debug("PERF_SAMPLE_READ is unsuported for now\n");
-		return -1;
-	}
-
-	if (type & PERF_SAMPLE_CALLCHAIN) {
-		data->callchain = (struct ip_callchain *)array;
-		array += 1 + data->callchain->nr;
-	}
-
-	if (type & PERF_SAMPLE_RAW) {
-		u32 *p = (u32 *)array;
-		data->raw_size = *p;
-		p++;
-		data->raw_data = p;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 2b7e91902f10..d79e4edd82f9 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -169,9 +169,10 @@ struct addr_location;
 int event__preprocess_sample(const event_t *self, struct perf_session *session,
 			     struct addr_location *al, struct sample_data *data,
 			     symbol_filter_t filter);
-int event__parse_sample(const event_t *event, struct perf_session *session,
-			struct sample_data *sample);
 
 const char *event__get_event_name(unsigned int id);
 
+int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
+			struct sample_data *sample);
+
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 9a6d94299ab8..a85ae12845ea 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -355,3 +355,121 @@ out_unmap:
 	}
 	return -1;
 }
+
+static int event__parse_id_sample(const event_t *event, u64 type,
+				  struct sample_data *sample)
+{
+	const u64 *array = event->sample.array;
+
+	array += ((event->header.size -
+		   sizeof(event->header)) / sizeof(u64)) - 1;
+
+	if (type & PERF_SAMPLE_CPU) {
+		u32 *p = (u32 *)array;
+		sample->cpu = *p;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		sample->stream_id = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_ID) {
+		sample->id = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		sample->time = *array;
+		array--;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		u32 *p = (u32 *)array;
+		sample->pid = p[0];
+		sample->tid = p[1];
+	}
+
+	return 0;
+}
+
+int event__parse_sample(const event_t *event, u64 type, bool sample_id_all,
+			struct sample_data *data)
+{
+	const u64 *array;
+
+	data->cpu = data->pid = data->tid = -1;
+	data->stream_id = data->id = data->time = -1ULL;
+
+	if (event->header.type != PERF_RECORD_SAMPLE) {
+		if (!sample_id_all)
+			return 0;
+		return event__parse_id_sample(event, type, data);
+	}
+
+	array = event->sample.array;
+
+	if (type & PERF_SAMPLE_IP) {
+		data->ip = event->ip.ip;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		u32 *p = (u32 *)array;
+		data->pid = p[0];
+		data->tid = p[1];
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		data->time = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_ADDR) {
+		data->addr = *array;
+		array++;
+	}
+
+	data->id = -1ULL;
+	if (type & PERF_SAMPLE_ID) {
+		data->id = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_CPU) {
+		u32 *p = (u32 *)array;
+		data->cpu = *p;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_PERIOD) {
+		data->period = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_READ) {
+		fprintf(stderr, "PERF_SAMPLE_READ is unsuported for now\n");
+		return -1;
+	}
+
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		data->callchain = (struct ip_callchain *)array;
+		array += 1 + data->callchain->nr;
+	}
+
+	if (type & PERF_SAMPLE_RAW) {
+		u32 *p = (u32 *)array;
+		data->raw_size = *p;
+		p++;
+		data->raw_data = p;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index b58a48a5e5a9..e6a07408669e 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -496,7 +496,7 @@ static void flush_sample_queue(struct perf_session *s,
 		if (iter->timestamp > limit)
 			break;
 
-		event__parse_sample(iter->event, s, &sample);
+		perf_session__parse_sample(s, iter->event, &sample);
 		perf_session_deliver_event(s, iter->event, &sample, ops,
 					   iter->file_offset);
 
@@ -806,7 +806,7 @@ static int perf_session__process_event(struct perf_session *session,
 	/*
 	 * For all kernel events we get the sample data
 	 */
-	event__parse_sample(event, session, &sample);
+	perf_session__parse_sample(session, event, &sample);
 
 	/* Preprocess sample records - precheck callchains */
 	if (perf_session__preprocess_sample(session, event, &sample))
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index e815468eb888..78239767011e 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -155,4 +155,13 @@ size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
 {
 	return hists__fprintf_nr_events(&self->hists, fp);
 }
+
+static inline int perf_session__parse_sample(struct perf_session *session,
+					     const event_t *event,
+					     struct sample_data *sample)
+{
+	return event__parse_sample(event, session->sample_type,
+				   session->sample_id_all, sample);
+}
+
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From ef1d1af28ca37fdbc2745da040529cd2953c1af5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 18 Jan 2011 21:41:45 -0200
Subject: perf evsel: Introduce perf_evsel__{in,ex}it

Out of the {con,des}structor, as in interpreted language bindings we will
need to go back from the wrapper object to the real thing. In that case
using container_of will save us to have an extra pointer in the perf_evsel
struct.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 29 ++++++++++++++++++++---------
 tools/perf/util/evlist.h |  2 ++
 tools/perf/util/evsel.c  | 22 ++++++++++++++++------
 tools/perf/util/evsel.h  |  3 +++
 4 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 4b3b84cd71a1..df0610e9c61b 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -6,17 +6,21 @@
 #include <linux/bitops.h>
 #include <linux/hash.h>
 
+void perf_evlist__init(struct perf_evlist *evlist)
+{
+	int i;
+
+	for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
+		INIT_HLIST_HEAD(&evlist->heads[i]);
+	INIT_LIST_HEAD(&evlist->entries);
+}
+
 struct perf_evlist *perf_evlist__new(void)
 {
 	struct perf_evlist *evlist = zalloc(sizeof(*evlist));
 
-	if (evlist != NULL) {
-		int i;
-
-		for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
-			INIT_HLIST_HEAD(&evlist->heads[i]);
-		INIT_LIST_HEAD(&evlist->entries);
-	}
+	if (evlist != NULL)
+		perf_evlist__init(evlist);
 
 	return evlist;
 }
@@ -33,11 +37,18 @@ static void perf_evlist__purge(struct perf_evlist *evlist)
 	evlist->nr_entries = 0;
 }
 
-void perf_evlist__delete(struct perf_evlist *evlist)
+void perf_evlist__exit(struct perf_evlist *evlist)
 {
-	perf_evlist__purge(evlist);
 	free(evlist->mmap);
 	free(evlist->pollfd);
+	evlist->mmap = NULL;
+	evlist->pollfd = NULL;
+}
+
+void perf_evlist__delete(struct perf_evlist *evlist)
+{
+	perf_evlist__purge(evlist);
+	perf_evlist__exit(evlist);
 	free(evlist);
 }
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 28712063db97..acbe48eac608 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -24,6 +24,8 @@ struct perf_evlist {
 struct perf_evsel;
 
 struct perf_evlist *perf_evlist__new(void);
+void perf_evlist__init(struct perf_evlist *evlist);
+void perf_evlist__exit(struct perf_evlist *evlist);
 void perf_evlist__delete(struct perf_evlist *evlist);
 
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a85ae12845ea..76ab553637d6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -14,15 +14,20 @@
 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
 #define SID(e, x, y) xyarray__entry(e->id, x, y)
 
+void perf_evsel__init(struct perf_evsel *evsel,
+		      struct perf_event_attr *attr, int idx)
+{
+	evsel->idx	   = idx;
+	evsel->attr	   = *attr;
+	INIT_LIST_HEAD(&evsel->node);
+}
+
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
 {
 	struct perf_evsel *evsel = zalloc(sizeof(*evsel));
 
-	if (evsel != NULL) {
-		evsel->idx	   = idx;
-		evsel->attr	   = *attr;
-		INIT_LIST_HEAD(&evsel->node);
-	}
+	if (evsel != NULL)
+		perf_evsel__init(evsel, attr, idx);
 
 	return evsel;
 }
@@ -87,11 +92,16 @@ int perf_evlist__alloc_mmap(struct perf_evlist *evlist, int ncpus)
 	return evlist->mmap != NULL ? 0 : -ENOMEM;
 }
 
-void perf_evsel__delete(struct perf_evsel *evsel)
+void perf_evsel__exit(struct perf_evsel *evsel)
 {
 	assert(list_empty(&evsel->node));
 	xyarray__delete(evsel->fd);
 	xyarray__delete(evsel->id);
+}
+
+void perf_evsel__delete(struct perf_evsel *evsel)
+{
+	perf_evsel__exit(evsel);
 	free(evsel);
 }
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 667ee4e2e35e..7962e7587dea 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -52,6 +52,9 @@ struct thread_map;
 struct perf_evlist;
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
+void perf_evsel__init(struct perf_evsel *evsel,
+		      struct perf_event_attr *attr, int idx);
+void perf_evsel__exit(struct perf_evsel *evsel);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
-- 
cgit v1.2.3


From 9599ec0471deae24044241e2173090d2cbc0e899 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 17 Jan 2011 17:39:15 -0800
Subject: x86-64, mem: Convert memmove() to assembly file and fix return value
 bug

memmove_64.c only implements memmove() function which is completely written in
inline assembly code. Therefore it doesn't make sense to keep the assembly code
in .c file.

Currently memmove() doesn't store return value to rax. This may cause issue if
caller uses the return value. The patch fixes this issue.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/x8664_ksyms_64.c |   1 +
 arch/x86/lib/memmove_64.S        | 197 +++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/memmove_64.c        | 192 --------------------------------------
 3 files changed, 198 insertions(+), 192 deletions(-)
 create mode 100644 arch/x86/lib/memmove_64.S
 delete mode 100644 arch/x86/lib/memmove_64.c

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memmove);
 
 EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..0ecb8433e5a8
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,197 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+	CFI_STARTPROC
+	/* Handle more 32bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jb	2f
+
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32byts forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32byts backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 6d0f0ec41b34..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-	unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
-	char *ret;
-
-	__asm__ __volatile__(
-		/* Handle more 32bytes in loop */
-		"mov %2, %3\n\t"
-		"cmp $0x20, %0\n\t"
-		"jb	1f\n\t"
-
-		/* Decide forward/backward copy mode */
-		"cmp %2, %1\n\t"
-		"jb	2f\n\t"
-
-		/*
-		 * movsq instruction have many startup latency
-		 * so we handle small size by general register.
-		 */
-		"cmp  $680, %0\n\t"
-		"jb 3f\n\t"
-		/*
-		 * movsq instruction is only good for aligned case.
-		 */
-		"cmpb %%dil, %%sil\n\t"
-		"je 4f\n\t"
-		"3:\n\t"
-		"sub $0x20, %0\n\t"
-		/*
-		 * We gobble 32byts forward in each loop.
-		 */
-		"5:\n\t"
-		"sub $0x20, %0\n\t"
-		"movq 0*8(%1), %4\n\t"
-		"movq 1*8(%1), %5\n\t"
-		"movq 2*8(%1), %6\n\t"
-		"movq 3*8(%1), %7\n\t"
-		"leaq 4*8(%1), %1\n\t"
-
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, 1*8(%2)\n\t"
-		"movq %6, 2*8(%2)\n\t"
-		"movq %7, 3*8(%2)\n\t"
-		"leaq 4*8(%2), %2\n\t"
-		"jae 5b\n\t"
-		"addq $0x20, %0\n\t"
-		"jmp 1f\n\t"
-		/*
-		 * Handle data forward by movsq.
-		 */
-		".p2align 4\n\t"
-		"4:\n\t"
-		"movq %0, %8\n\t"
-		"movq -8(%1, %0), %4\n\t"
-		"lea -8(%2, %0), %5\n\t"
-		"shrq $3, %8\n\t"
-		"rep movsq\n\t"
-		"movq %4, (%5)\n\t"
-		"jmp 13f\n\t"
-		/*
-		 * Handle data backward by movsq.
-		 */
-		".p2align 4\n\t"
-		"7:\n\t"
-		"movq %0, %8\n\t"
-		"movq (%1), %4\n\t"
-		"movq %2, %5\n\t"
-		"leaq -8(%1, %0), %1\n\t"
-		"leaq -8(%2, %0), %2\n\t"
-		"shrq $3, %8\n\t"
-		"std\n\t"
-		"rep movsq\n\t"
-		"cld\n\t"
-		"movq %4, (%5)\n\t"
-		"jmp 13f\n\t"
-
-		/*
-		 * Start to prepare for backward copy.
-		 */
-		".p2align 4\n\t"
-		"2:\n\t"
-		"cmp $680, %0\n\t"
-		"jb 6f \n\t"
-		"cmp %%dil, %%sil\n\t"
-		"je 7b \n\t"
-		"6:\n\t"
-		/*
-		 * Calculate copy position to tail.
-		 */
-		"addq %0, %1\n\t"
-		"addq %0, %2\n\t"
-		"subq $0x20, %0\n\t"
-		/*
-		 * We gobble 32byts backward in each loop.
-		 */
-		"8:\n\t"
-		"subq $0x20, %0\n\t"
-		"movq -1*8(%1), %4\n\t"
-		"movq -2*8(%1), %5\n\t"
-		"movq -3*8(%1), %6\n\t"
-		"movq -4*8(%1), %7\n\t"
-		"leaq -4*8(%1), %1\n\t"
-
-		"movq %4, -1*8(%2)\n\t"
-		"movq %5, -2*8(%2)\n\t"
-		"movq %6, -3*8(%2)\n\t"
-		"movq %7, -4*8(%2)\n\t"
-		"leaq -4*8(%2), %2\n\t"
-		"jae 8b\n\t"
-		/*
-		 * Calculate copy position to head.
-		 */
-		"addq $0x20, %0\n\t"
-		"subq %0, %1\n\t"
-		"subq %0, %2\n\t"
-		"1:\n\t"
-		"cmpq $16, %0\n\t"
-		"jb 9f\n\t"
-		/*
-		 * Move data from 16 bytes to 31 bytes.
-		 */
-		"movq 0*8(%1), %4\n\t"
-		"movq 1*8(%1), %5\n\t"
-		"movq -2*8(%1, %0), %6\n\t"
-		"movq -1*8(%1, %0), %7\n\t"
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, 1*8(%2)\n\t"
-		"movq %6, -2*8(%2, %0)\n\t"
-		"movq %7, -1*8(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		".p2align 4\n\t"
-		"9:\n\t"
-		"cmpq $8, %0\n\t"
-		"jb 10f\n\t"
-		/*
-		 * Move data from 8 bytes to 15 bytes.
-		 */
-		"movq 0*8(%1), %4\n\t"
-		"movq -1*8(%1, %0), %5\n\t"
-		"movq %4, 0*8(%2)\n\t"
-		"movq %5, -1*8(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"10:\n\t"
-		"cmpq $4, %0\n\t"
-		"jb 11f\n\t"
-		/*
-		 * Move data from 4 bytes to 7 bytes.
-		 */
-		"movl (%1), %4d\n\t"
-		"movl -4(%1, %0), %5d\n\t"
-		"movl %4d, (%2)\n\t"
-		"movl %5d, -4(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"11:\n\t"
-		"cmp $2, %0\n\t"
-		"jb 12f\n\t"
-		/*
-		 * Move data from 2 bytes to 3 bytes.
-		 */
-		"movw (%1), %4w\n\t"
-		"movw -2(%1, %0), %5w\n\t"
-		"movw %4w, (%2)\n\t"
-		"movw %5w, -2(%2, %0)\n\t"
-		"jmp 13f\n\t"
-		"12:\n\t"
-		"cmp $1, %0\n\t"
-		"jb 13f\n\t"
-		/*
-		 * Move data for 1 byte.
-		 */
-		"movb (%1), %4b\n\t"
-		"movb %4b, (%2)\n\t"
-		"13:\n\t"
-		: "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
-		  "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
-		:"0" (count),
-		 "1" (src),
-		 "2" (dest)
-		:"memory");
-
-		return ret;
-
-}
-EXPORT_SYMBOL(memmove);
-- 
cgit v1.2.3


From d518573de63fb119e5e9a3137386544671387681 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 24 Jan 2011 16:05:40 +0100
Subject: x86, amd: Normalize compute unit IDs on multi-node processors

On multi-node CPUs we don't need the socket wide compute unit ID
but the node-wide compute unit ID. Thus we need to normalize the
value. This is similar to what we do with cpu_core_id.

A compute unit is then identified by physical_package_id,
node_id, and compute_unit_id.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <1295881543-572552-2-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c | 8 ++++++--
 arch/x86/kernel/smpboot.c | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7c7bedb83c5a..990cc4861586 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -261,7 +261,7 @@ static int __cpuinit nearby_node(int apicid)
 #ifdef CONFIG_X86_HT
 static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
 {
-	u32 nodes;
+	u32 nodes, cores_per_cu;
 	u8 node_id;
 	int cpu = smp_processor_id();
 
@@ -276,6 +276,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
 		/* get compute unit information */
 		smp_num_siblings = ((ebx >> 8) & 3) + 1;
 		c->compute_unit_id = ebx & 0xff;
+		cores_per_cu = ((ebx >> 8) & 3) + 1;
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
@@ -288,15 +289,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
 	/* fixup multi-node processor information */
 	if (nodes > 1) {
 		u32 cores_per_node;
+		u32 cus_per_node;
 
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
 		cores_per_node = c->x86_max_cores / nodes;
+		cus_per_node = cores_per_node / cores_per_cu;
 
 		/* store NodeID, use llc_shared_map to store sibling info */
 		per_cpu(cpu_llc_id, cpu) = node_id;
 
 		/* core id to be in range from 0 to (cores_per_node - 1) */
-		c->cpu_core_id = c->cpu_core_id % cores_per_node;
+		c->cpu_core_id %= cores_per_node;
+		c->compute_unit_id %= cus_per_node;
 	}
 }
 #endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0cbe8c0b35ed..fbaa2229af5b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -414,6 +414,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 
 			if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
 				if (c->phys_proc_id == o->phys_proc_id &&
+				    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
 				    c->compute_unit_id == o->compute_unit_id)
 					link_thread_siblings(cpu, i);
 			} else if (c->phys_proc_id == o->phys_proc_id &&
-- 
cgit v1.2.3


From b453de02b786c63b8928ec822401468131db0a9b Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld <hans.rosenfeld@amd.com>
Date: Mon, 24 Jan 2011 16:05:41 +0100
Subject: x86, amd: Enable L3 cache index disable on family 0x15

AMD family 0x15 CPUs support L3 cache index disable, so enable
it on them.

Signed-off-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: <andreas.herrmann3@amd.com>
LKML-Reference: <1295881543-572552-3-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/amd_nb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 0a99f7198bc3..a4f394c8e055 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -85,6 +85,9 @@ int amd_cache_northbridges(void)
 	     boot_cpu_data.x86_mask >= 0x1))
 		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
+	if (boot_cpu_data.x86 == 0x15)
+		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(amd_cache_northbridges);
-- 
cgit v1.2.3


From 41b2610c3443e6c4760e61fc10eef73f96f9f6a5 Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld <hans.rosenfeld@amd.com>
Date: Mon, 24 Jan 2011 16:05:42 +0100
Subject: x86, amd: Extend AMD northbridge caching code to support "Link
 Control" devices

"Link Control" devices (NB function 4) will be used by L3 cache
partitioning on family 0x15.

Signed-off-by: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: <andreas.herrmann3@amd.com>
LKML-Reference: <1295881543-572552-4-git-send-email-hans.rosenfeld@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/amd_nb.h |  1 +
 arch/x86/kernel/amd_nb.c      | 11 +++++++++--
 include/linux/pci_ids.h       |  1 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..3e7070071d73 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -26,6 +26,7 @@ extern void amd_get_nodes(struct bootnode *nodes);
 
 struct amd_northbridge {
 	struct pci_dev *misc;
+	struct pci_dev *link;
 };
 
 struct amd_northbridge_info {
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index a4f394c8e055..4ae9a961c33c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -20,6 +20,11 @@ struct pci_device_id amd_nb_misc_ids[] = {
 };
 EXPORT_SYMBOL(amd_nb_misc_ids);
 
+static struct pci_device_id amd_nb_link_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) },
+	{}
+};
+
 const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
 	{ 0x00, 0x18, 0x20 },
 	{ 0xff, 0x00, 0x20 },
@@ -45,7 +50,7 @@ int amd_cache_northbridges(void)
 {
 	int i = 0;
 	struct amd_northbridge *nb;
-	struct pci_dev *misc;
+	struct pci_dev *misc, *link;
 
 	if (amd_nb_num())
 		return 0;
@@ -64,10 +69,12 @@ int amd_cache_northbridges(void)
 	amd_northbridges.nb = nb;
 	amd_northbridges.num = i;
 
-	misc = NULL;
+	link = misc = NULL;
 	for (i = 0; i != amd_nb_num(); i++) {
 		node_to_amd_nb(i)->misc = misc =
 			next_northbridge(misc, amd_nb_misc_ids);
+		node_to_amd_nb(i)->link = link =
+			next_northbridge(link, amd_nb_link_ids);
         }
 
 	/* some CPU families (e.g. family 0x11) do not support GART */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3adb06ebf841..580de67f318b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -518,6 +518,7 @@
 #define PCI_DEVICE_ID_AMD_11H_NB_MISC	0x1303
 #define PCI_DEVICE_ID_AMD_11H_NB_LINK	0x1304
 #define PCI_DEVICE_ID_AMD_15H_NB_MISC	0x1603
+#define PCI_DEVICE_ID_AMD_15H_NB_LINK	0x1604
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
cgit v1.2.3


From b3d7336db553d318e7ec042eb50a70d307013339 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:29:44 -0800
Subject: x86: Move llc_shared_map out of cpu_info

cpu_info is already with per_cpu, We can take llc_shared_map out
of cpu_info, and declare it as per_cpu variable directly.

So later referencing could be simple and directly instead of
diving to find cpu_info at first.

Also could make smp_store_cpu_info() much simple to avoid to do
save and restore trick.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Hans Rosenfeld <hans.rosenfeld@amd.com>
Cc: Alok N Kataria <akataria@vmware.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Hans J. Koch <hjk@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4D3A16E8.5020608@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h      |  4 ----
 arch/x86/include/asm/smp.h            |  7 +++++++
 arch/x86/kernel/cpu/intel_cacheinfo.c |  4 ++--
 arch/x86/kernel/cpu/mcheck/mce_amd.c  |  7 ++-----
 arch/x86/kernel/smpboot.c             | 38 +++++++++--------------------------
 5 files changed, 21 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 45636cefa186..4c25ab48257b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
 	int			x86_cache_alignment;	/* In bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
-#ifdef CONFIG_SMP
-	/* cpus sharing the last level cache: */
-	cpumask_var_t		llc_shared_map;
-#endif
 	/* cpuid returned max cores value: */
 	u16			 x86_max_cores;
 	u16			apicid;
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c7fc1b..3597825a3112 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -23,6 +23,8 @@ extern unsigned int num_processors;
 
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
 DECLARE_PER_CPU(int, cpu_number);
 
@@ -36,6 +38,11 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 	return per_cpu(cpu_core_map, cpu);
 }
 
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+	return per_cpu(cpu_llc_shared_map, cpu);
+}
+
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ec2c19a7b8ef..5419a263ebd1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -732,11 +732,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
 	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
-		for_each_cpu(i, c->llc_shared_map) {
+		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
 			if (!per_cpu(ici_cpuid4_info, i))
 				continue;
 			this_leaf = CPUID4_INFO_IDX(i, index);
-			for_each_cpu(sibling, c->llc_shared_map) {
+			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
 				if (!cpu_online(sibling))
 					continue;
 				set_bit(sibling, this_leaf->shared_cpu_map);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 5bf2fac52aca..167f97b5596e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -527,15 +527,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
 	char name[32];
-#ifdef CONFIG_SMP
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-#endif
 
 	sprintf(name, "threshold_bank%i", bank);
 
 #ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
-		i = cpumask_first(c->llc_shared_map);
+		i = cpumask_first(cpu_llc_shared_mask(cpu));
 
 		/* first core not up yet */
 		if (cpu_data(i).cpu_core_id)
@@ -555,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (err)
 			goto out;
 
-		cpumask_copy(b->cpus, c->llc_shared_map);
+		cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
 
 		goto out;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0cbe8c0b35ed..d396155f436c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -130,6 +130,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
+DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -355,23 +357,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* In this case, llc_shared_map is a pointer to a cpumask. */
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
-				    const struct cpuinfo_x86 *src)
-{
-	struct cpumask *llc = dst->llc_shared_map;
-	*dst = *src;
-	dst->llc_shared_map = llc;
-}
-#else
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
-				    const struct cpuinfo_x86 *src)
-{
-	*dst = *src;
-}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -381,7 +366,7 @@ void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = &cpu_data(id);
 
-	copy_cpuinfo_x86(c, &boot_cpu_data);
+	*c = boot_cpu_data;
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
@@ -389,15 +374,12 @@ void __cpuinit smp_store_cpu_info(int id)
 
 static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 {
-	struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
-	struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
-
 	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
 	cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
-	cpumask_set_cpu(cpu1, c2->llc_shared_map);
-	cpumask_set_cpu(cpu2, c1->llc_shared_map);
+	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
+	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
 }
 
 
@@ -425,7 +407,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
 	}
 
-	cpumask_set_cpu(cpu, c->llc_shared_map);
+	cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 
 	if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
 		cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -436,8 +418,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
-			cpumask_set_cpu(i, c->llc_shared_map);
-			cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
+			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
+			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
 		}
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -476,7 +458,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	    !(cpu_has(c, X86_FEATURE_AMD_DCM)))
 		return cpu_core_mask(cpu);
 	else
-		return c->llc_shared_map;
+		return cpu_llc_shared_mask(cpu);
 }
 
 static void impress_friends(void)
@@ -1103,7 +1085,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 	set_cpu_sibling_map(0);
 
-- 
cgit v1.2.3


From 792363d2beceb1c7d865e517fa9939c8b8c1442a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:29:54 -0800
Subject: x86: Don't copy per_cpu cpuinfo for BSP two times

smp_store_cpu_info(0) will do that.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
LKML-Reference: <4D3A16F2.5090902@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d396155f436c..a7a3d1acc632 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1071,13 +1071,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 	preempt_disable();
 	smp_cpu_index_default();
-	memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
-	cpumask_copy(cpu_callin_mask, cpumask_of(0));
-	mb();
+
 	/*
 	 * Setup boot CPU information
 	 */
 	smp_store_cpu_info(0); /* Final full version of the data */
+	cpumask_copy(cpu_callin_mask, cpumask_of(0));
+	mb();
 #ifdef CONFIG_X86_32
 	boot_cpu_logical_apicid = logical_smp_processor_id();
 #endif
-- 
cgit v1.2.3


From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:01 -0800
Subject: sched: Simplify update_cfs_shares parameters

Re-visiting this: Since update_cfs_shares will now only ever re-weight an
entity that is a relative parent of the current entity in enqueue_entity; we
can safely issue the account_entity_enqueue relative to that cfs_rq and avoid
the requirement for special handling of the enqueue case in update_cfs_shares.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.915214637@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  2 +-
 kernel/sched_fair.c | 30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..e0fa3ff7f194 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..0c550c841eee 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
  * Update the current task's runtime statistics. Skip current tasks that
@@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long load_weight, load, shares;
 
-	load = cfs_rq->load.weight + weight_delta;
+	load = cfs_rq->load.weight;
 
 	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
 	load_weight += load;
+	load_weight -= cfs_rq->load_contribution;
 
 	shares = (tg->shares * load);
 	if (load_weight)
@@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 }
 # else /* CONFIG_SMP */
@@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
@@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v1.2.3


From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initalized the initial load update
will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period to be
within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to be 1 at cfs_rq initialization, this
allows for an initial load_stamp > load_last which then lets standard idle
truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to be
[period - 1] in this path resulting in a slightly more predictable shares ramp.
(Previously the amount of idle time preserved by the overflow would range between
[period/2,period-1].)

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 2 ++
 kernel/sched_fair.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa3ff7f194..6820b5b3a969 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c550c841eee..4cbc9121094c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
-- 
cgit v1.2.3


From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:00 -0800
Subject: softirqs: Free up pf flag PF_KSOFTIRQD

Cleanup patch, freeing up PF_KSOFTIRQD and use per_cpu ksoftirqd pointer
instead, as suggested by Eric Dumazet.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h | 7 +++++++
 include/linux/sched.h     | 1 -
 kernel/sched.c            | 2 +-
 kernel/softirq.c          | 3 +--
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d4253e49..a1382b9b5813 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -426,6 +426,13 @@ extern void raise_softirq(unsigned int nr);
  */
 DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
 
+DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+
+static inline struct task_struct *this_cpu_ksoftirqd(void)
+{
+	return this_cpu_read(ksoftirqd);
+}
+
 /* Try to send a softirq to a remote cpu.  If this cannot be done, the
  * work will be queued to the local cpu.
  */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f948b34e..af6e15fbfb78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1715,7 +1715,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 /*
  * Per process flags
  */
-#define PF_KSOFTIRQD	0x00000001	/* I am ksoftirqd */
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
diff --git a/kernel/sched.c b/kernel/sched.c
index 6820b5b3a969..8b89b3bba565 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..0cee50487629 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu)
 {
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	current->flags |= PF_KSOFTIRQD;
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
-- 
cgit v1.2.3


From a1dabb6bfffccb897eff3e1d102dacf2a4bedf3b Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:01 -0800
Subject: time: Add nsecs_to_cputime64 interface for asm-generic

Add nsecs_to_cputime64 interface. This is used in following patches that
updates cpu irq stat based on ns granularity info in IRQ_TIME_ACCOUNTING.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-3-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/cputime.h |  3 +++
 include/linux/jiffies.h       |  1 +
 kernel/time.c                 | 23 +++++++++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 2bcc5c7c22a6..61e03dd7939e 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -30,6 +30,9 @@ typedef u64 cputime64_t;
 #define cputime64_to_jiffies64(__ct)	(__ct)
 #define jiffies64_to_cputime64(__jif)	(__jif)
 #define cputime_to_cputime64(__ct)	((u64) __ct)
+#define cputime64_gt(__a, __b)		((__a) >  (__b))
+
+#define nsecs_to_cputime64(__ct)	nsecs_to_jiffies64(__ct)
 
 
 /*
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6811f4bfc6e7..922aa313c9f9 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
+extern u64 nsecs_to_jiffies64(u64 n);
 extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..55337a816b20 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
 }
 
 /**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
  *
  * @n:	nsecs in u64
  *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
  *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
  *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
  */
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
 {
 #if (NSEC_PER_SEC % HZ) == 0
 	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n)
 #endif
 }
 
+
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:	nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+	return (unsigned long)nsecs_to_jiffies64(n);
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3


From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:02 -0800
Subject: sched: Refactor account_system_time separating id-update

Refactor account_system_time, to separate out the logic of
identifying the update needed and code that does actual update.

This is used by following patch for IRQ_TIME_ACCOUNTING,
which has different identification logic and same update logic.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b89b3bba565..e3fa92106ed7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3567,6 +3567,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	}
 }
 
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
 /*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
-- 
cgit v1.2.3


From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:03 -0800
Subject: sched: Export ns irqtimes through /proc/stat

CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU.
This info is already used in scheduler to do proper task chargeback
(earlier patches). This patch retro-fits this ns granularity
hardirq and softirq information to /proc/stat irq and softirq fields.

The update is still done on timer tick, where we look at accumulated
ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest
accordingly.

No new interface added.

Earlier versions looked at adding this as new fields in some /proc
files. This one seems to be the best in terms of impact to existing
apps, even though it has somewhat more kernel code than earlier versions.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index e3fa92106ed7..2a3c9799d76b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 		sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif
+
 /*
  * Account for involuntary wait time.
  * @steal: the cpu time spent in involuntary wait
@@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
-- 
cgit v1.2.3


From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 21 Dec 2010 17:09:04 -0800
Subject: softirqs: Account ksoftirqd time as cpustat softirq

softirq time in ksoftirqd context is not accounted in ns granularity
per cpu softirq stats, as we want that to be a part of ksoftirqd
exec_runtime.

Accounting them as softirq on /proc/stat separately.

Tested-by: Shaun Ruffell <sruffell@digium.com>
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 2a3c9799d76b..8b718b59b09f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
 	} else if (user_tick) {
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	} else if (p == rq->idle) {
-- 
cgit v1.2.3


From a8941d7ec81678fb69aea7183338175f112f3e0d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 25 Jan 2011 16:30:03 +0100
Subject: sched: Simplify the idle scheduling class

Since commit 48c5ccae88dcd (sched: Simplify cpu-hot-unplug task
migration) this should no longer happen, so remove the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_idletask.c | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..41eb62a0808b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,16 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
-			     int running)
+static void
+switched_to_idle(struct rq *rq, struct task_struct *p, int running)
 {
-	/* Can this actually happen?? */
-	if (running)
-		resched_task(rq->curr);
-	else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 			      int oldprio, int running)
 {
-	/* This can happen for hot plug CPUS */
-
-	/*
-	 * Reschedule if we are currently running on this runqueue and
-	 * our priority decreased, or if we are not currently running on
-	 * this runqueue and our priority is higher than the current's
-	 */
-	if (running) {
-		if (p->prio > oldprio)
-			resched_task(rq->curr);
-	} else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-- 
cgit v1.2.3


From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 17 Jan 2011 17:03:27 +0100
Subject: sched: Fix switch_from_fair()

When a task is taken out of the fair class we must ensure the vruntime
is properly normalized because when we put it back in it will assume
to be normalized.

The case that goes wrong is when changing away from the fair class
while sleeping. Sleeping tasks have non-normalized vruntime in order
to make sleeper-fairness work. So treat the switch away from fair as a
wakeup and preserve the relative vruntime.

Also update sysrq-n to call the ->switch_{to,from} methods.

Reported-by: Onkalo Samu <samu.p.onkalo@nokia.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  8 +++-----
 kernel/sched.c          | 25 ++++++++++++++-----------
 kernel/sched_fair.c     | 42 ++++++++++++++++++++++++++++++++++++------
 kernel/sched_idletask.c |  7 +++----
 kernel/sched_rt.c       | 19 ++++++++++---------
 kernel/sched_stoptask.c |  7 +++----
 6 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index af6e15fbfb78..0542774914d4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1084,12 +1084,10 @@ struct sched_class {
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
 	void (*task_fork) (struct task_struct *p);
 
-	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
-			       int running);
-	void (*switched_to) (struct rq *this_rq, struct task_struct *task,
-			     int running);
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
-			     int oldprio, int running);
+			     int oldprio);
 
 	unsigned int (*get_rr_interval) (struct rq *rq,
 					 struct task_struct *task);
diff --git a/kernel/sched.c b/kernel/sched.c
index 8b718b59b09f..78fa75394011 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
+	p->se.vruntime			= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -5028,11 +5028,10 @@ recheck:
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->se.on_rq;
@@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4cbc9121094c..55040f3938d8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p)
  * Priority of the task has changed. Check to see if we preempt
  * the current task.
  */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * Reschedule if we are currently running on this runqueue and
 	 * our priority decreased, or if we are not currently running on
 	 * this runqueue and our priority is higher than the current's
 	 */
-	if (running) {
+	if (rq->curr == p) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
 
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Ensure the task's vruntime is normalized, so that when its
+	 * switched back to the fair class the enqueue_entity(.flags=0) will
+	 * do the right thing.
+	 *
+	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * the task is sleeping will it still have non-normalized vruntime.
+	 */
+	if (!se->on_rq && p->state != TASK_RUNNING) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+}
+
 /*
  * We switched to the sched_fair class.
  */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * We were most likely switched from sched_rt, so
 	 * kick off the schedule if running, otherwise just see
 	 * if we can still preempt the current task.
 	 */
-	if (running)
+	if (rq->curr == p)
 		resched_task(rq->curr);
 	else
 		check_preempt_curr(rq, p, 0);
@@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = {
 	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
+	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 
 	.get_rr_interval	= get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 41eb62a0808b..c82f26c1b7c3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,14 +52,13 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void
-switched_to_idle(struct rq *rq, struct task_struct *p, int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
 {
 	BUG();
 }
 
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	BUG();
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..c381fdc18c64 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq)
  * When switch from the rt queue, we bring ourselves to a position
  * that we might want to pull RT tasks from other runqueues.
  */
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
 	 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!rq->rt.rt_nr_running)
+	if (p->se.on_rq && !rq->rt.rt_nr_running)
 		pull_rt_task(rq);
 }
 
@@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void)
  * with RT tasks. In this case we try to push them off to
  * other runqueues.
  */
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
 	int check_resched = 1;
 
@@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (!running) {
+	if (p->se.on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->rt.overloaded && push_rt_task(rq) &&
 		    /* Don't resched if we changed runqueues */
@@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
  * Priority of the task has changed. This may cause
  * us to initiate a push or pull.
  */
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
-			    int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (running) {
+	if (!p->se.on_rq)
+		return;
+
+	if (rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..84ec9bcf82d9 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
 
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	BUG(); /* how!?, what priority? */
 }
-- 
cgit v1.2.3


From ccaa8d657117bb1876d471bd91579d774106778d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 25 Jan 2011 23:17:32 +0100
Subject: rtmutex-tester: Remove BKL tests

The BKL is going away, no need to test it any more.
I left the definitions of the test case numbers
in, so that the other tests do not get renumbered.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1295993854-4971-19-git-send-email-arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rtmutex-tester.c | 39 ++++-----------------------------------
 1 file changed, 4 insertions(+), 35 deletions(-)

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..d5b543506cbc 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/sysdev.h>
 #include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
 	int			opcode;
 	int			opdata;
 	int			mutexes[MAX_RT_TEST_MUTEXES];
-	int			bkl;
 	int			event;
 	struct sys_device	sysdev;
 };
@@ -46,8 +44,8 @@ enum test_opcodes {
 	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
 	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
 	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
-	RTTEST_LOCKBKL,		/* 9 Lock BKL */
-	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
+	RTTEST_LOCKBKL,		/* 9 Was: Lock BKL */
+	RTTEST_UNLOCKBKL,	/* 10 Was: Unlock BKL */
 	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
 	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
 	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
@@ -74,13 +72,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 				td->mutexes[i] = 0;
 			}
 		}
-
-		if (!lockwakeup && td->bkl == 4) {
-#ifdef CONFIG_LOCK_KERNEL
-			unlock_kernel();
-#endif
-			td->bkl = 0;
-		}
 		return 0;
 
 	case RTTEST_RESETEVENT:
@@ -131,25 +122,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
 		td->mutexes[id] = 0;
 		return 0;
 
-	case RTTEST_LOCKBKL:
-		if (td->bkl)
-			return 0;
-		td->bkl = 1;
-#ifdef CONFIG_LOCK_KERNEL
-		lock_kernel();
-#endif
-		td->bkl = 4;
-		return 0;
-
-	case RTTEST_UNLOCKBKL:
-		if (td->bkl != 4)
-			break;
-#ifdef CONFIG_LOCK_KERNEL
-		unlock_kernel();
-#endif
-		td->bkl = 0;
-		return 0;
-
 	default:
 		break;
 	}
@@ -196,7 +168,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		break;
 
-	case RTTEST_LOCKBKL:
 	default:
 		break;
 	}
@@ -229,8 +200,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
 		td->event = atomic_add_return(1, &rttest_event);
 		return;
 
-	case RTTEST_LOCKBKL:
-		return;
 	default:
 		return;
 	}
@@ -380,11 +349,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
 	spin_lock(&rttest_lock);
 
 	curr += sprintf(curr,
-		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
 		td->opcode, td->event, tsk->state,
 			(MAX_RT_PRIO - 1) - tsk->prio,
 			(MAX_RT_PRIO - 1) - tsk->normal_prio,
-		tsk->pi_blocked_on, td->bkl);
+		tsk->pi_blocked_on);
 
 	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
 		curr += sprintf(curr, "%d", td->mutexes[i]);
-- 
cgit v1.2.3


From 61bb46082775acd18c712607615a8b7dbeff7873 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:10:51 +0100
Subject: alpha: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Matt Turner <mattst88@gmail.com>
---
 arch/alpha/include/asm/rwsem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 1570c0b54336..19c9cc7ed94b 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -39,7 +39,7 @@ struct rw_semaphore {
 };
 
 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
 	LIST_HEAD_INIT((name).wait_list) }
 
 #define DECLARE_RWSEM(name) \
-- 
cgit v1.2.3


From e41c8ab174f47ed5ed10a365482d0d7b0e352beb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:14:15 +0100
Subject: cris: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
---
 arch/cris/arch-v32/kernel/smp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 84fed3b4b079..4c9e3e1ba5d1 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -26,7 +26,9 @@
 #define FLUSH_ALL (void*)0xffffffff
 
 /* Vector of locks used for various atomic operations */
-spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED};
+spinlock_t cris_atomic_locks[] = {
+	[0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks)
+};
 
 /* CPU masks */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
-- 
cgit v1.2.3


From 7424cdf77b1b2975d82619084f20f0055f715166 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:16:35 +0100
Subject: mips: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/kernel/vpe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 6a1fdfef8fde..ab52b7cf3b6b 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -148,9 +148,9 @@ struct {
 	spinlock_t tc_list_lock;
 	struct list_head tc_list;	/* Thread contexts */
 } vpecontrol = {
-	.vpe_list_lock	= SPIN_LOCK_UNLOCKED,
+	.vpe_list_lock	= __SPIN_LOCK_UNLOCKED(vpe_list_lock),
 	.vpe_list	= LIST_HEAD_INIT(vpecontrol.vpe_list),
-	.tc_list_lock	= SPIN_LOCK_UNLOCKED,
+	.tc_list_lock	= __SPIN_LOCK_UNLOCKED(tc_list_lock),
 	.tc_list	= LIST_HEAD_INIT(vpecontrol.tc_list)
 };
 
-- 
cgit v1.2.3


From 24774fbdeab8f6ac05a19e81bd645b0f7e5d2bb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:19:12 +0100
Subject: sparc: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/atomic32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index cbddeb38ffda..d3c7a12ad879 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -16,7 +16,7 @@
 #define ATOMIC_HASH(a)	(&__atomic_hash[(((unsigned long)a)>>8) & (ATOMIC_HASH_SIZE-1)])
 
 spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] = {
-	[0 ... (ATOMIC_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
+	[0 ... (ATOMIC_HASH_SIZE-1)] = __SPIN_LOCK_UNLOCKED(__atomic_hash)
 };
 
 #else /* SMP */
-- 
cgit v1.2.3


From 22e650045899011b028e40625bc73df9f3260bac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:21:25 +0100
Subject: um: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jeff Dike <jdike@addtoit.com>
---
 arch/um/drivers/ubd_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ba4a98ba39c0..620f5b70957d 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -185,7 +185,7 @@ struct ubd {
 	.no_cow =               0, \
 	.shared =		0, \
 	.cow =			DEFAULT_COW, \
-	.lock =			SPIN_LOCK_UNLOCKED,	\
+	.lock =			__SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
 	.request =		NULL, \
 	.start_sg =		0, \
 	.end_sg =		0, \
-- 
cgit v1.2.3


From 235454c99851ba21038061b9acf38d1a636068c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:23:14 +0100
Subject: xtensa: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Zankel <chris@zankel.net>
---
 arch/xtensa/include/asm/rwsem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index e39edf5c86f2..9d32f6874393 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -38,7 +38,7 @@ struct rw_semaphore {
 };
 
 #define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list) }
 
 #define DECLARE_RWSEM(name)		\
-- 
cgit v1.2.3


From 92578c0b8078f6919f9b47e7e16a1cf770bd127b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:24:55 +0100
Subject: kthread: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/kthread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index ce0775aa64c3..7ff16f7d3ed4 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -64,7 +64,7 @@ struct kthread_work {
 };
 
 #define KTHREAD_WORKER_INIT(worker)	{				\
-	.lock = SPIN_LOCK_UNLOCKED,					\
+	.lock = __SPIN_LOCK_UNLOCKED((worker).lock),			\
 	.work_list = LIST_HEAD_INIT((worker).work_list),		\
 	}
 
-- 
cgit v1.2.3


From 10389a15e25fd4784d42de7e0e3fc8c242f2011d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:25:56 +0100
Subject: cred: Replace deprecated spinlock initialization

SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant
instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..b5496e81b0f7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
 static struct thread_group_cred init_tgcred = {
 	.usage	= ATOMIC_INIT(2),
 	.tgid	= 0,
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
 };
 #endif
 
-- 
cgit v1.2.3


From d04fa5a3ba06c3b7a1c4a6860d0fa4825507a755 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 23 Jan 2011 15:30:09 +0100
Subject: locking: Remove deprecated lock initializers

Last users are gone. Remove the left overs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/spinlocks.txt    | 24 +-----------------------
 include/linux/rwlock_types.h   |  8 --------
 include/linux/spinlock_types.h |  8 --------
 scripts/checkpatch.pl          |  5 -----
 4 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 178c831b907d..2e3c64b1a6a5 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -86,7 +86,7 @@ to change the variables it has to get an exclusive write lock.
 
 The routines look the same as above:
 
-   rwlock_t xxx_lock = RW_LOCK_UNLOCKED;
+   rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
 
 	unsigned long flags;
 
@@ -196,25 +196,3 @@ appropriate:
 
 For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
 __SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
-
-SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated.  These interfere
-with lockdep state tracking.
-
-Most of the time, you can simply turn:
-	static spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
-into:
-	static DEFINE_SPINLOCK(xxx_lock);
-
-Static structure member variables go from:
-
-	struct foo bar {
-		.lock	=	SPIN_LOCK_UNLOCKED;
-	};
-
-to:
-
-	struct foo bar {
-		.lock	=	__SPIN_LOCK_UNLOCKED(bar.lock);
-	};
-
-Declaration of static rw_locks undergo a similar transformation.
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index bd31808c7d8e..cc0072e93e36 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -43,14 +43,6 @@ typedef struct {
 				RW_DEP_MAP_INIT(lockname) }
 #endif
 
-/*
- * RW_LOCK_UNLOCKED defeat lockdep state tracking and is hence
- * deprecated.
- *
- * Please use DEFINE_RWLOCK() or __RW_LOCK_UNLOCKED() as appropriate.
- */
-#define RW_LOCK_UNLOCKED	__RW_LOCK_UNLOCKED(old_style_rw_init)
-
 #define DEFINE_RWLOCK(x)	rwlock_t x = __RW_LOCK_UNLOCKED(x)
 
 #endif /* __LINUX_RWLOCK_TYPES_H */
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 851b7783720d..73548eb13a5d 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -81,14 +81,6 @@ typedef struct spinlock {
 #define __SPIN_LOCK_UNLOCKED(lockname) \
 	(spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
 
-/*
- * SPIN_LOCK_UNLOCKED defeats lockdep state tracking and is hence
- * deprecated.
- * Please use DEFINE_SPINLOCK() or __SPIN_LOCK_UNLOCKED() as
- * appropriate.
- */
-#define SPIN_LOCK_UNLOCKED	__SPIN_LOCK_UNLOCKED(old_style_spin_init)
-
 #define DEFINE_SPINLOCK(x)	spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
 
 #include <linux/rwlock_types.h>
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4c0383da1c9a..58848e3e392c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2654,11 +2654,6 @@ sub process {
 			WARN("Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr);
 		}
 
-# SPIN_LOCK_UNLOCKED & RW_LOCK_UNLOCKED are deprecated
-		if ($line =~ /\b(SPIN_LOCK_UNLOCKED|RW_LOCK_UNLOCKED)/) {
-			ERROR("Use of $1 is deprecated: see Documentation/spinlocks.txt\n" . $herecurr);
-		}
-
 # warn about #if 0
 		if ($line =~ /^.\s*\#\s*if\s+0\b/) {
 			CHK("if this code is redundant consider removing it\n" .
-- 
cgit v1.2.3


From c16a87ce063f79e0ec7d25ce2950e1bc6db03c72 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:50 +0000
Subject: rwsem: Cleanup includes

All rwsem implementations include the same headers. Include them from
include/linux/rwsem.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.483520950@linutronix.de>
---
 arch/alpha/include/asm/rwsem.h   | 4 ----
 arch/ia64/include/asm/rwsem.h    | 3 ---
 arch/powerpc/include/asm/rwsem.h | 5 -----
 arch/s390/include/asm/rwsem.h    | 5 -----
 arch/sh/include/asm/rwsem.h      | 5 -----
 arch/sparc/include/asm/rwsem.h   | 6 ------
 arch/x86/include/asm/rwsem.h     | 6 ------
 arch/xtensa/include/asm/rwsem.h  | 6 ------
 include/linux/rwsem-spinlock.h   | 8 --------
 include/linux/rwsem.h            | 3 +++
 10 files changed, 3 insertions(+), 48 deletions(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 19c9cc7ed94b..839a3fa20944 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -13,10 +13,6 @@
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
 
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 215d5454c7d3..9bcf0792b01c 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -25,9 +25,6 @@
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
 #include <asm/intrinsics.h>
 
 /*
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 8447d89fbe72..c12abbe1d06f 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -13,11 +13,6 @@
  * by Paul Mackerras <paulus@samba.org>.
  */
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 423fdda2322d..9cc8592b33bb 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -43,11 +43,6 @@
 
 #ifdef __KERNEL__
 
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 06e2251a5e48..df6f34623c54 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -11,11 +11,6 @@
 #endif
 
 #ifdef __KERNEL__
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index a2b4302869bc..902d36bf150d 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -12,12 +12,6 @@
 #endif
 
 #ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-struct rwsem_waiter;
-
 struct rw_semaphore {
 	signed long			count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000L
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b60..a626cff86041 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,14 +37,8 @@
 #endif
 
 #ifdef __KERNEL__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
 #include <asm/asm.h>
 
-struct rwsem_waiter;
-
 extern asmregparm struct rw_semaphore *
  rwsem_down_read_failed(struct rw_semaphore *sem);
 extern asmregparm struct rw_semaphore *
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 9d32f6874393..1be2102c648e 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -16,12 +16,6 @@
 #ifndef _LINUX_RWSEM_H
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-
 /*
  * the semaphore definition
  */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index bdfcc2527970..8c0dc7fc07a4 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -12,15 +12,7 @@
 #error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead"
 #endif
 
-#include <linux/spinlock.h>
-#include <linux/list.h>
-
 #ifdef __KERNEL__
-
-#include <linux/types.h>
-
-struct rwsem_waiter;
-
 /*
  * the rw-semaphore definition
  * - if activity is 0 then there are no active readers or writers
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index efd348fe8ca7..496296d12d62 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -11,6 +11,9 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
 #include <asm/system.h>
 #include <asm/atomic.h>
 
-- 
cgit v1.2.3


From bde11efbc21ea84c3351464a422b467eaefabb9a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:53 +0000
Subject: x86: Cleanup rwsem_count_t typedef

Remove the typedef which has no real reason to be there.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.580335506@linutronix.de>
---
 arch/x86/include/asm/rwsem.h | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a626cff86041..c30206c2bbf9 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -68,10 +68,8 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-typedef signed long rwsem_count_t;
-
 struct rw_semaphore {
-	rwsem_count_t		count;
+	long			count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -127,7 +125,7 @@ static inline void __down_read(struct rw_semaphore *sem)
  */
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
-	rwsem_count_t result, tmp;
+	long result, tmp;
 	asm volatile("# beginning __down_read_trylock\n\t"
 		     "  mov          %0,%1\n\t"
 		     "1:\n\t"
@@ -149,7 +147,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
  */
 static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning down_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* adds 0xffff0001, returns the old value */
@@ -174,9 +172,8 @@ static inline void __down_write(struct rw_semaphore *sem)
  */
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
-	rwsem_count_t ret = cmpxchg(&sem->count,
-				    RWSEM_UNLOCKED_VALUE,
-				    RWSEM_ACTIVE_WRITE_BIAS);
+	long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+			   RWSEM_ACTIVE_WRITE_BIAS);
 	if (ret == RWSEM_UNLOCKED_VALUE)
 		return 1;
 	return 0;
@@ -187,7 +184,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
  */
 static inline void __up_read(struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning __up_read\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* subtracts 1, returns the old value */
@@ -205,7 +202,7 @@ static inline void __up_read(struct rw_semaphore *sem)
  */
 static inline void __up_write(struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp;
+	long tmp;
 	asm volatile("# beginning __up_write\n\t"
 		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"
 		     /* subtracts 0xffff0001, returns the old value */
@@ -241,8 +238,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(rwsem_count_t delta,
-				    struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
 {
 	asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
 		     : "+m" (sem->count)
@@ -252,10 +248,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
 /*
  * implement exchange and add functionality
  */
-static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
-						struct rw_semaphore *sem)
+static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 {
-	rwsem_count_t tmp = delta;
+	long tmp = delta;
 
 	asm volatile(LOCK_PREFIX "xadd %0,%1"
 		     : "+r" (tmp), "+m" (sem->count)
-- 
cgit v1.2.3


From 1c8ed640d918290ddc1de5ada02ef6686a733c9f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:05:56 +0000
Subject: rwsem: Move duplicate struct rwsem declaration to linux/rwsem.h

The difference between these declarations is the data type of the
count member and the lack of lockdep in some architectures/

long is equivivalent to signed long and the #ifdef guarded dep_map
member does not hurt anyone.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.679641914@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/include/asm/rwsem.h   |  8 --------
 arch/ia64/include/asm/rwsem.h    |  9 ---------
 arch/powerpc/include/asm/rwsem.h |  9 ---------
 arch/s390/include/asm/rwsem.h    | 12 ------------
 arch/sh/include/asm/rwsem.h      | 12 +-----------
 arch/sparc/include/asm/rwsem.h   |  9 +--------
 arch/x86/include/asm/rwsem.h     | 12 ------------
 arch/xtensa/include/asm/rwsem.h  |  9 +--------
 include/linux/rwsem.h            | 13 ++++++++++++-
 9 files changed, 15 insertions(+), 78 deletions(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 839a3fa20944..3e5619ead29d 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -19,20 +19,12 @@ extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	long			count;
 #define RWSEM_UNLOCKED_VALUE		0x0000000000000000L
 #define RWSEM_ACTIVE_BIAS		0x0000000000000001L
 #define RWSEM_ACTIVE_MASK		0x00000000ffffffffL
 #define RWSEM_WAITING_BIAS		(-0x0000000100000000L)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
 
 #define __RWSEM_INITIALIZER(name) \
 	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 9bcf0792b01c..1fe465804dc7 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -27,15 +27,6 @@
 
 #include <asm/intrinsics.h>
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
-
 #define RWSEM_UNLOCKED_VALUE		__IA64_UL_CONST(0x0000000000000000)
 #define RWSEM_ACTIVE_BIAS		(1L)
 #define RWSEM_ACTIVE_MASK		(0xffffffffL)
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index c12abbe1d06f..bc1acc229223 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,15 +28,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-struct rw_semaphore {
-	long			count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
 #else
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 9cc8592b33bb..6e075f1d97b4 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -49,18 +49,6 @@ extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
 
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
-
 #ifndef __s390x__
 #define RWSEM_UNLOCKED_VALUE	0x00000000
 #define RWSEM_ACTIVE_BIAS	0x00000001
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index df6f34623c54..dffc62589f79 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -11,23 +11,13 @@
 #endif
 
 #ifdef __KERNEL__
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	long		count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
 #define RWSEM_ACTIVE_MASK		0x0000ffff
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-};
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 902d36bf150d..4c16d1de2ab5 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -12,20 +12,13 @@
 #endif
 
 #ifdef __KERNEL__
-struct rw_semaphore {
-	signed long			count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000L
 #define RWSEM_ACTIVE_BIAS		0x00000001L
 #define RWSEM_ACTIVE_MASK		0xffffffffL
 #define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t			wait_lock;
-	struct list_head		wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map		dep_map;
-#endif
-};
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index c30206c2bbf9..995cfe4c985d 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -49,8 +49,6 @@ extern asmregparm struct rw_semaphore *
  rwsem_downgrade_wake(struct rw_semaphore *sem);
 
 /*
- * the semaphore definition
- *
  * The bias values and the counter type limits the number of
  * potential readers/writers to 32767 for 32 bits and 2147483647
  * for 64 bits.
@@ -68,22 +66,12 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-struct rw_semaphore {
-	long			count;
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
 #else
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
-
 #define __RWSEM_INITIALIZER(name)				\
 {								\
 	RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 1be2102c648e..585cab9b0bdf 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -16,20 +16,13 @@
 #ifndef _LINUX_RWSEM_H
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
 #endif
-/*
- * the semaphore definition
- */
-struct rw_semaphore {
-	signed long		count;
+
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
 #define RWSEM_ACTIVE_MASK		0x0000ffff
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
-	struct list_head	wait_list;
-};
 
 #define __RWSEM_INITIALIZER(name) \
 	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 496296d12d62..e8be18edb664 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -22,7 +22,18 @@ struct rw_semaphore;
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
 #include <linux/rwsem-spinlock.h> /* use a generic implementation */
 #else
-#include <asm/rwsem.h> /* use an arch-specific implementation */
+/* All arch specific implementations share the same struct */
+struct rw_semaphore {
+	long			count;
+	spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+};
+
+/* Include the arch specific part */
+#include <asm/rwsem.h>
 #endif
 
 /*
-- 
cgit v1.2.3


From 12249b34414dba7f386aadcf6be7ca36c6878300 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:00 +0000
Subject: rwsem: Move duplicate init macros and functions to linux/rwsem.h

The rwsem initializers and related macros and functions are mostly the
same. Some of them lack the lockdep initializer, but having it in
place does not matter for architectures which do not support lockdep.

powerpc, sparc, x86: No functional change

sh, s390: Removes the duplicate init_rwsem (inline and #define)

alpha, ia64, xtensa: Use the lockdep capable init function in
       	     	     lib/rwsem.c which is just uninlining the init
       	     	     function for the LOCKDEP=n case

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.771812729@linutronix.de>
---
 arch/alpha/include/asm/rwsem.h   | 14 --------------
 arch/ia64/include/asm/rwsem.h    | 15 ---------------
 arch/powerpc/include/asm/rwsem.h | 27 ---------------------------
 arch/s390/include/asm/rwsem.h    | 35 -----------------------------------
 arch/sh/include/asm/rwsem.h      | 31 -------------------------------
 arch/sparc/include/asm/rwsem.h   | 23 -----------------------
 arch/x86/include/asm/rwsem.h     | 25 -------------------------
 arch/xtensa/include/asm/rwsem.h  | 14 --------------
 include/linux/rwsem-spinlock.h   | 23 +----------------------
 include/linux/rwsem.h            | 25 +++++++++++++++++++++++++
 10 files changed, 26 insertions(+), 206 deletions(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index 3e5619ead29d..c9f5b90e926e 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -26,20 +26,6 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
-	LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 static inline void __down_read(struct rw_semaphore *sem)
 {
 	long oldcount;
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 1fe465804dc7..379854630e9d 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -34,26 +34,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	  LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-static inline void
-init_rwsem (struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index bc1acc229223..f86fdf743afb 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,38 +28,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name)				\
-{								\
-	RWSEM_UNLOCKED_VALUE,					\
-	__SPIN_LOCK_UNLOCKED((name).wait_lock),			\
-	LIST_HEAD_INIT((name).wait_list)			\
-	__RWSEM_DEP_MAP_INIT(name)				\
-}
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)					\
-	do {						\
-		static struct lock_class_key __key;	\
-							\
-		__init_rwsem((sem), #sem, &__key);	\
-	} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 6e075f1d97b4..f0f527756ee1 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -63,41 +63,6 @@ extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
 #define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-/*
- * initialisation
- */
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait.lock), \
-   LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)				\
-do {						\
-	static struct lock_class_key __key;	\
-						\
-	__init_rwsem((sem), #sem, &__key);	\
-} while (0)
-
-
 /*
  * lock for reading
  */
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index dffc62589f79..798699d0687b 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -19,42 +19,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	  LIST_HEAD_INIT((name).wait_list) \
-	  __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)				\
-do {						\
-	static struct lock_class_key __key;	\
-						\
-	__init_rwsem((sem), #sem, &__key);	\
-} while (0)
-
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 4c16d1de2ab5..79f7c1ca6f91 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -20,34 +20,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-  LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 995cfe4c985d..c0c91b4ecc81 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -66,31 +66,6 @@ extern asmregparm struct rw_semaphore *
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name)				\
-{								\
-	RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
-	LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
-}
-
-#define DECLARE_RWSEM(name)					\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
-
 /*
  * lock for reading
  */
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 585cab9b0bdf..2fa420212353 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -24,25 +24,11 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-#define __RWSEM_INITIALIZER(name) \
-	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list) }
-
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	INIT_LIST_HEAD(&sem->wait_list);
-}
-
 /*
  * lock for reading
  */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index 8c0dc7fc07a4..34701241b673 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -29,28 +29,7 @@ struct rw_semaphore {
 #endif
 };
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
-#else
-# define __RWSEM_DEP_MAP_INIT(lockname)
-#endif
-
-#define __RWSEM_INITIALIZER(name) \
-{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \
-  __RWSEM_DEP_MAP_INIT(name) }
-
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
-
-extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
-			 struct lock_class_key *key);
-
-#define init_rwsem(sem)						\
-do {								\
-	static struct lock_class_key __key;			\
-								\
-	__init_rwsem((sem), #sem, &__key);			\
-} while (0)
+#define RWSEM_UNLOCKED_VALUE		0x00000000
 
 extern void __down_read(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index e8be18edb664..8970c3e802e2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -36,6 +36,31 @@ struct rw_semaphore {
 #include <asm/rwsem.h>
 #endif
 
+/* Common initializer macros and functions */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
+#else
+# define __RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
+#define __RWSEM_INITIALIZER(name) \
+	{ RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED(name.wait_lock),	\
+	  LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) }
+
+#define DECLARE_RWSEM(name) \
+	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+
+extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
+			 struct lock_class_key *key);
+
+#define init_rwsem(sem)						\
+do {								\
+	static struct lock_class_key __key;			\
+								\
+	__init_rwsem((sem), #sem, &__key);			\
+} while (0)
+
 /*
  * lock for reading
  */
-- 
cgit v1.2.3


From 41e5887fa39ab272d9266a09cbefdef270e28b93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:03 +0000
Subject: rwsem: Unify the duplicate rwsem_is_locked() inlines

Instead of having the same implementation in each architecture, move
it to linux/rwsem.h and remove the duplicates. It's unlikely that an
arch will ever implement something different, but we can deal with
that when it happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.876773757@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/include/asm/rwsem.h   | 5 -----
 arch/ia64/include/asm/rwsem.h    | 5 -----
 arch/powerpc/include/asm/rwsem.h | 5 -----
 arch/s390/include/asm/rwsem.h    | 5 -----
 arch/sh/include/asm/rwsem.h      | 5 -----
 arch/sparc/include/asm/rwsem.h   | 5 -----
 arch/x86/include/asm/rwsem.h     | 5 -----
 arch/xtensa/include/asm/rwsem.h  | 5 -----
 include/linux/rwsem.h            | 7 +++++++
 9 files changed, 7 insertions(+), 40 deletions(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index c9f5b90e926e..fc7f54337c3a 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -224,10 +224,5 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 379854630e9d..148b61a96b8c 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -147,9 +147,4 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* _ASM_IA64_RWSEM_H */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index f86fdf743afb..38e8e5c508d5 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -133,10 +133,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return atomic_long_add_return(delta, (atomic_long_t *)&sem->count);
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return sem->count != 0;
-}
-
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_RWSEM_H */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index f0f527756ee1..80522dcb6cc2 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -325,10 +325,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return new;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 798699d0687b..2f8cf9761eb5 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -133,10 +133,5 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 79f7c1ca6f91..4f4391fd5658 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -124,11 +124,6 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return atomic64_add_return(delta, (atomic64_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index c0c91b4ecc81..776d76fe2f90 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -222,10 +222,5 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return tmp + delta;
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index 2fa420212353..da61633ccba8 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -133,9 +133,4 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
-static inline int rwsem_is_locked(struct rw_semaphore *sem)
-{
-	return (sem->count != 0);
-}
-
 #endif	/* _XTENSA_RWSEM_H */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8970c3e802e2..8b9c7e9f6910 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -34,6 +34,13 @@ struct rw_semaphore {
 
 /* Include the arch specific part */
 #include <asm/rwsem.h>
+
+/* In all implementations count != 0 means locked */
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return sem->count != 0;
+}
+
 #endif
 
 /* Common initializer macros and functions */
-- 
cgit v1.2.3


From aac72277fda6ef788bb8d5deaa502ce9b9b6e472 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 20:06:06 +0000
Subject: rwsem: Move duplicate function prototypes to linux/rwsem.h

All architecture specific rwsem headers carry the same function
prototypes. Just x86 adds asmregparm, which is an empty define on all
other architectures. S390 has a stale rwsem_downgrade_write()
prototype.

Remove the duplicates and add the prototypes to linux/rwsem.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <20110126195833.970840140@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/include/asm/rwsem.h   | 5 -----
 arch/ia64/include/asm/rwsem.h    | 5 -----
 arch/powerpc/include/asm/rwsem.h | 5 -----
 arch/s390/include/asm/rwsem.h    | 6 ------
 arch/sh/include/asm/rwsem.h      | 5 -----
 arch/sparc/include/asm/rwsem.h   | 5 -----
 arch/x86/include/asm/rwsem.h     | 9 ---------
 arch/xtensa/include/asm/rwsem.h  | 5 -----
 include/linux/rwsem.h            | 5 +++++
 9 files changed, 5 insertions(+), 45 deletions(-)

diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index fc7f54337c3a..a83bbea62c67 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -14,11 +14,6 @@
 
 #include <linux/compiler.h>
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 #define RWSEM_UNLOCKED_VALUE		0x0000000000000000L
 #define RWSEM_ACTIVE_BIAS		0x0000000000000001L
 #define RWSEM_ACTIVE_MASK		0x00000000ffffffffL
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index 148b61a96b8c..3027e7516d85 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -34,11 +34,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/powerpc/include/asm/rwsem.h b/arch/powerpc/include/asm/rwsem.h
index 38e8e5c508d5..bb1e2cdeb9bf 100644
--- a/arch/powerpc/include/asm/rwsem.h
+++ b/arch/powerpc/include/asm/rwsem.h
@@ -28,11 +28,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 80522dcb6cc2..d0eb4653cebd 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -43,12 +43,6 @@
 
 #ifdef __KERNEL__
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_write(struct rw_semaphore *);
-
 #ifndef __s390x__
 #define RWSEM_UNLOCKED_VALUE	0x00000000
 #define RWSEM_ACTIVE_BIAS	0x00000001
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index 2f8cf9761eb5..edab57265293 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -19,11 +19,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 4f4391fd5658..069bf4d663a1 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -20,11 +20,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 776d76fe2f90..df4cd32b4cc6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -39,15 +39,6 @@
 #ifdef __KERNEL__
 #include <asm/asm.h>
 
-extern asmregparm struct rw_semaphore *
- rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *
- rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *
- rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * The bias values and the counter type limits the number of
  * potential readers/writers to 32767 for 32 bits and 2147483647
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
index da61633ccba8..249619e7e7f2 100644
--- a/arch/xtensa/include/asm/rwsem.h
+++ b/arch/xtensa/include/asm/rwsem.h
@@ -24,11 +24,6 @@
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
 /*
  * lock for reading
  */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8b9c7e9f6910..f7c957f3165f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -32,6 +32,11 @@ struct rw_semaphore {
 #endif
 };
 
+extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+
 /* Include the arch specific part */
 #include <asm/rwsem.h>
 
-- 
cgit v1.2.3


From d123375425d7df4b6081a631fc1203fceafa59b2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Jan 2011 21:32:01 +0100
Subject: rwsem: Remove redundant asmregparm annotation

Peter Zijlstra pointed out, that the only user of asmregparm (x86) is
compiling the kernel already with -mregparm=3. So the annotation of
the rwsem functions is redundant. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
LKML-Reference: <alpine.LFD.2.00.1101262130450.31804@localhost6.localdomain6>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/rwsem.h |  8 ++++----
 lib/rwsem.c           | 10 ++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f7c957f3165f..a8afe9cd000c 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -32,10 +32,10 @@ struct rw_semaphore {
 #endif
 };
 
-extern asmregparm struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
+extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
 /* Include the arch specific part */
 #include <asm/rwsem.h>
diff --git a/lib/rwsem.c b/lib/rwsem.c
index f236d7cd5cf3..aa7c3052261f 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -222,8 +222,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 /*
  * wait for the read lock to be granted
  */
-asmregparm struct rw_semaphore __sched *
-rwsem_down_read_failed(struct rw_semaphore *sem)
+struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 {
 	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ,
 					-RWSEM_ACTIVE_READ_BIAS);
@@ -232,8 +231,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-asmregparm struct rw_semaphore __sched *
-rwsem_down_write_failed(struct rw_semaphore *sem)
+struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
 	return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE,
 					-RWSEM_ACTIVE_WRITE_BIAS);
@@ -243,7 +241,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
  * handle waking up a waiter on the semaphore
  * - up_read/up_write has decremented the active part of count if we come here
  */
-asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -263,7 +261,7 @@ asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
  * - caller incremented waiting part of count and discovered it still negative
  * - just wake up any readers at the front of the queue
  */
-asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 26 Jan 2011 13:36:03 +0100
Subject: sched: Avoid expensive initial update_cfs_load(), on UP too

Fix the build on UP.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 78fa75394011..477e1bcc63f9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7922,7 +7922,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
 	cfs_rq->load_stamp = 1;
+#endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
-- 
cgit v1.2.3


From 88d4f0db7fa8785859c1d637f9aac210932b6216 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 25 Jan 2011 19:40:51 +0100
Subject: perf: Fix alloc_callchain_buffers()

Commit 927c7a9e92c4 ("perf: Fix race in callchains") introduced
a mismatch in the sizing of struct callchain_cpus_entries.

nr_cpu_ids must be used instead of num_possible_cpus(), or we
might get out of bound memory accesses on some machines.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Miller <davem@davemloft.net>
Cc: Stephane Eranian <eranian@google.com>
CC: stable@kernel.org
LKML-Reference: <1295980851.3588.351.camel@edumazet-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 126a302c481c..852ae8c66502 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1999,8 +1999,7 @@ static int alloc_callchain_buffers(void)
 	 * accessed from NMI. Use a temporary manual per cpu allocation
 	 * until that gets sorted out.
 	 */
-	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
-		num_possible_cpus();
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
 
 	entries = kzalloc(size, GFP_KERNEL);
 	if (!entries)
-- 
cgit v1.2.3


From d038b12c6d773a4b9f69ca5243773bf6314f7ee9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 25 Jan 2011 17:32:01 +0200
Subject: perf: Fix Pentium4 raw event validation

This patch fixes some issues with raw event validation on
Pentium 4 (Netburst) based processors.

As I was testing libpfm4 Netburst support, I ran into two
problems in the p4_validate_raw_event() function:

   - the shared field must be checked ONLY when HT is on
   - the binding to ESCR register was missing

The second item was causing raw events to not be encoded
correctly compared to generic PMU events.

With this patch, I can now pass Netburst events to libpfm4
examples and get meaningful results:

  $ task -e global_power_events:running:u  noploop 1
  noploop for 1 seconds
  3,206,304,898 global_power_events:running

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: perfmon2-devel@lists.sf.net
Cc: eranian@gmail.com
Cc: robert.richter@amd.com
Cc: acme@redhat.com
Cc: gorcunov@gmail.com
Cc: ming.m.lin@intel.com
LKML-Reference: <4d3efb2f.1252d80a.1a80.ffffc83f@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e56b9bfbabd1..f7a0993c1e7c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -682,7 +682,7 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 * if an event is shared accross the logical threads
 	 * the user needs special permissions to be able to use it
 	 */
-	if (p4_event_bind_map[v].shared) {
+	if (p4_ht_active() && p4_event_bind_map[v].shared) {
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return -EACCES;
 	}
@@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event)
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
 	if (event->attr.type == PERF_TYPE_RAW) {
-
+		struct p4_event_bind *bind;
+		unsigned int esel;
 		/*
 		 * Clear bits we reserve to be managed by kernel itself
 		 * and never allowed from a user space
@@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event)
 		 * bits since we keep additional info here (for cache events and etc)
 		 */
 		event->hw.config |= event->attr.config;
+		bind = p4_config_get_bind(event->attr.config);
+		if (!bind) {
+			rc = -EINVAL;
+			goto out;
+		}
+		esel = P4_OPCODE_ESEL(bind->opcode);
+		event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
 	}
 
 	rc = x86_setup_perfctr(event);
-- 
cgit v1.2.3


From dda99116969142cc41e945a1047a419b937536af Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 21 Jan 2011 15:30:01 -0800
Subject: x86, perf: Change two init functions to static

init_hw_perf_events() is called via early_initcall now.
x86_pmu_event_init is x86_pmu member function.

So we can change them to static.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
LKML-Reference: <4D3A16F9.109@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9d977a2ea693..4d98789b0664 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1389,7 +1389,7 @@ static void __init pmu_check_apic(void)
 	pr_info("no hardware sampling interrupt available.\n");
 }
 
-int __init init_hw_perf_events(void)
+static int __init init_hw_perf_events(void)
 {
 	struct event_constraint *c;
 	int err;
@@ -1608,7 +1608,7 @@ out:
 	return ret;
 }
 
-int x86_pmu_event_init(struct perf_event *event)
+static int x86_pmu_event_init(struct perf_event *event)
 {
 	struct pmu *tmp;
 	int err;
-- 
cgit v1.2.3


From 1fb0ef31f428f345a7c3666f8e7444a563edd537 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 31 Jan 2011 08:57:41 +0100
Subject: genirq: Fix affinity notifier fallout

The new code of commit cd7eab44e(genirq: Add IRQ affinity notifiers)
references irq_desc.affinity which fails to compile with
CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y.

Use irq_desc.irq_data.affinity instead.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ben Hutchings <bhutchings@solarflare.com>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0587c5ceaed8..538fce2db51c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -179,7 +179,7 @@ static void irq_affinity_notify(struct work_struct *work)
 		cpumask_copy(cpumask, desc->pending_mask);
 	else
 #endif
-		cpumask_copy(cpumask, desc->affinity);
+		cpumask_copy(cpumask, desc->irq_data.affinity);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	notify->notify(notify, cpumask);
-- 
cgit v1.2.3