From 84c4e620d35f49f486a900af214ad12276afb386 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:40 +0100
Subject: perf: Close install vs. exit race

Consider the following scenario:

  CPU0					CPU1

  ctx = find_get_ctx();
					perf_event_exit_task_context()
  mutex_lock(&ctx->mutex);
  perf_install_in_context(ctx, ...);
    /* NO-OP */
  mutex_unlock(&ctx->mutex);

  ...

  perf_release()
    WARN_ON_ONCE(event->state != STATE_EXIT);

Since the event doesn't pass through perf_remove_from_context()
because perf_install_in_context() NO-OPs because the ctx is dead, and
perf_event_exit_task_context() will not observe the event because its
not attached yet, the event->state will not be set.

Solve this by revalidating ctx->task after we acquire ctx->mutex and
failing the event creation as a whole.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.626853419@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0d58522103cd..d7b0316e3465 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2158,13 +2158,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	raw_spin_lock_irq(&ctx->lock);
 	task = ctx->task;
+
 	/*
-	 * Worse, we cannot even rely on the ctx actually existing anymore. If
-	 * between find_get_context() and perf_install_in_context() the task
-	 * went through perf_event_exit_task() its dead and we should not be
-	 * adding new events.
+	 * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
+	 * ctx gets destroyed, we must not install an event into it.
+	 *
+	 * This is normally tested for after we acquire the mutex, so this is
+	 * a sanity check.
 	 */
-	if (task == TASK_TOMBSTONE) {
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
@@ -8389,10 +8391,19 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (move_group) {
 		gctx = group_leader->ctx;
 		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		if (gctx->task == TASK_TOMBSTONE) {
+			err = -ESRCH;
+			goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_locked;
+	}
+
 	if (!perf_event_validate_size(event)) {
 		err = -E2BIG;
 		goto err_locked;
@@ -8563,12 +8574,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_unlock;
+	}
+
 	if (!exclusive_event_installable(event, ctx)) {
-		mutex_unlock(&ctx->mutex);
-		perf_unpin_context(ctx);
-		put_ctx(ctx);
 		err = -EBUSY;
-		goto err_free;
+		goto err_unlock;
 	}
 
 	perf_install_in_context(ctx, event, cpu);
@@ -8577,6 +8590,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_unlock:
+	mutex_unlock(&ctx->mutex);
+	perf_unpin_context(ctx);
+	put_ctx(ctx);
 err_free:
 	free_event(event);
 err:
-- 
cgit v1.2.3


From 130056275ade730e7a79c110212c8815202773ee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:41 +0100
Subject: perf: Do not double free

In case of: err_file: fput(event_file), we'll end up calling
perf_release() which in turn will free the event.

Do not then free the event _again_.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.697350349@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7b0316e3465..211a5ce64e3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8520,7 +8520,12 @@ err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
-	free_event(event);
+	/*
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (!event_file)
+		free_event(event);
 err_cpus:
 	put_online_cpus();
 err_task:
-- 
cgit v1.2.3


From a4f4bb6d0c69d0bb573f1d9e6f1b806f9b038b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:42 +0100
Subject: perf: Allow perf_release() with !event->ctx

In the err_file: fput(event_file) case, the event will not yet have
been attached to a context. However perf_release() does assume it has
been. Cure this.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.793996260@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 211a5ce64e3f..d5299e2e435d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3754,9 +3754,19 @@ static void put_event(struct perf_event *event)
  */
 int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
 
+	/*
+	 * If we got here through err_file: fput(event_file); we will not have
+	 * attached to a context yet.
+	 */
+	if (!ctx) {
+		WARN_ON_ONCE(event->attach_state &
+				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+		goto no_ctx;
+	}
+
 	if (!is_kernel_event(event))
 		perf_remove_from_owner(event);
 
@@ -3832,8 +3842,8 @@ again:
 	}
 	mutex_unlock(&event->child_mutex);
 
-	/* Must be the last reference */
-	put_event(event);
+no_ctx:
+	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
-- 
cgit v1.2.3


From 6f932e5be1503ab0783699e843db325d44c2fabb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:43 +0100
Subject: perf: Only update context time when active

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.860690919@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d5299e2e435d..64698fbfad9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2170,12 +2170,12 @@ perf_install_in_context(struct perf_event_context *ctx,
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	update_context_time(ctx);
-	/*
-	 * Update cgrp time only if current cgrp matches event->cgrp.
-	 * Must be done before calling add_event_to_ctx().
-	 */
-	update_cgrp_time_from_event(event);
+
+	if (ctx->is_active) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+
 	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 
-- 
cgit v1.2.3


From a69b0ca4ac3bf5427b571f11cbf33f0a32b728d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:44 +0100
Subject: perf: Fix cloning

Alexander reported that when the 'original' context gets destroyed, no
new clones happen.

This can happen irrespective of the ctx switch optimization, any task
can die, even the parent, and we want to continue monitoring the task
hierarchy until we either close the event or no tasks are left in the
hierarchy.

perf_event_init_context() will attempt to pin the 'parent' context
during clone(). At that point current is the parent, and since current
cannot have exited while executing clone(), its context cannot have
passed through perf_event_exit_task_context(). Therefore
perf_pin_task_context() cannot observe ctx->task == TASK_TOMBSTONE.

However, since inherit_event() does:

	if (parent_event->parent)
		parent_event = parent_event->parent;

it looks at the 'original' event when it does: is_orphaned_event().
This can return true if the context that contains the this event has
passed through perf_event_exit_task_context(). And thus we'll fail to
clone the perf context.

Fix this by adding a new state: STATE_DEAD, which is set by
perf_release() to indicate that the filedesc (or kernel reference) is
dead and there are no observers for our data left.

Only for STATE_DEAD will is_orphaned_event() be true and inhibit
cloning.

STATE_EXIT is otherwise preserved such that is_event_hup() remains
functional and will report when the observed task hierarchy becomes
empty.

Reported-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Fixes: c6e5b73242d2 ("perf: Synchronously clean up child events")
Link: http://lkml.kernel.org/r/20160224174947.919845295@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 29 ++++++++++++++---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b35a61a481fa..39156619e108 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -397,6 +397,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_DEAD		= -4,
 	PERF_EVENT_STATE_EXIT		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 64698fbfad9f..92d6999a4f2f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,7 +1645,7 @@ out:
 
 static bool is_orphaned_event(struct perf_event *event)
 {
-	return event->state == PERF_EVENT_STATE_EXIT;
+	return event->state == PERF_EVENT_STATE_DEAD;
 }
 
 static inline int pmu_filter_match(struct perf_event *event)
@@ -1732,7 +1732,6 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP	0x01UL
-#define DETACH_STATE	0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -1752,8 +1751,6 @@ __perf_remove_from_context(struct perf_event *event,
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_STATE)
-		event->state = PERF_EVENT_STATE_EXIT;
 
 	if (!ctx->nr_events && ctx->is_active) {
 		ctx->is_active = 0;
@@ -3772,22 +3769,24 @@ int perf_event_release_kernel(struct perf_event *event)
 
 	ctx = perf_event_ctx_lock(event);
 	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
-	perf_event_ctx_unlock(event, ctx);
+	perf_remove_from_context(event, DETACH_GROUP);
 
+	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
-	 * either from the above perf_remove_from_context() or through
-	 * perf_event_exit_event().
+	 * Mark this even as STATE_DEAD, there is no external reference to it
+	 * anymore.
 	 *
-	 * Therefore, anybody acquiring event->child_mutex after the below
-	 * loop _must_ also see this, most importantly inherit_event() which
-	 * will avoid placing more children on the list.
+	 * Anybody acquiring event->child_mutex after the below loop _must_
+	 * also see this, most importantly inherit_event() which will avoid
+	 * placing more children on the list.
 	 *
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+	event->state = PERF_EVENT_STATE_DEAD;
+	raw_spin_unlock_irq(&ctx->lock);
+
+	perf_event_ctx_unlock(event, ctx);
 
 again:
 	mutex_lock(&event->child_mutex);
@@ -4000,7 +3999,7 @@ static bool is_event_hup(struct perf_event *event)
 {
 	bool no_children;
 
-	if (event->state != PERF_EVENT_STATE_EXIT)
+	if (event->state > PERF_EVENT_STATE_EXIT)
 		return false;
 
 	mutex_lock(&event->child_mutex);
@@ -8727,7 +8726,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
-- 
cgit v1.2.3


From 9107c89e269d2738019861bb518e3d59bef01781 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:45 +0100
Subject: perf: Fix race between event install and jump_labels

perf_install_in_context() relies upon the context switch hooks to have
scheduled in events when the IPI misses its target -- after all, if
the task has moved from the CPU (or wasn't running at all), it will
have to context switch to run elsewhere.

This however doesn't appear to be happening.

It is possible for the IPI to not happen (task wasn't running) only to
later observe the task running with an inactive context.

The only possible explanation is that the context switch hooks are not
called. Therefore put in a sync_sched() after toggling the jump_label
to guarantee all CPUs will have them enabled before we install an
event.

A simple if (0->1) sync_sched() will not in fact work, because any
further increment can race and complete before the sync_sched().
Therefore we must jump through some hoops.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174947.980211985@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  6 +++---
 kernel/events/core.c       | 49 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 39156619e108..f5c5a3fa2c81 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -906,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 	}
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
@@ -925,7 +925,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_in(prev, task);
 
 	if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -942,7 +942,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
 	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 92d6999a4f2f..ea064ca8dd3c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -321,7 +321,13 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -3536,12 +3542,22 @@ static void unaccount_event(struct perf_event *event)
 	if (has_branch_stack(event))
 		dec = true;
 
-	if (dec)
-		static_key_slow_dec_deferred(&perf_sched_events);
+	if (dec) {
+		if (!atomic_add_unless(&perf_sched_count, -1, 1))
+			schedule_delayed_work(&perf_sched_work, HZ);
+	}
 
 	unaccount_event_cpu(event, event->cpu);
 }
 
+static void perf_sched_delayed(struct work_struct *work)
+{
+	mutex_lock(&perf_sched_mutex);
+	if (atomic_dec_and_test(&perf_sched_count))
+		static_branch_disable(&perf_sched_events);
+	mutex_unlock(&perf_sched_mutex);
+}
+
 /*
  * The following implement mutual exclusion of events on "exclusive" pmus
  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -7780,8 +7796,28 @@ static void account_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		inc = true;
 
-	if (inc)
-		static_key_slow_inc(&perf_sched_events.key);
+	if (inc) {
+		if (atomic_inc_not_zero(&perf_sched_count))
+			goto enabled;
+
+		mutex_lock(&perf_sched_mutex);
+		if (!atomic_read(&perf_sched_count)) {
+			static_branch_enable(&perf_sched_events);
+			/*
+			 * Guarantee that all CPUs observe they key change and
+			 * call the perf scheduling hooks before proceeding to
+			 * install events that need them.
+			 */
+			synchronize_sched();
+		}
+		/*
+		 * Now that we have waited for the sync_sched(), allow further
+		 * increments to by-pass the mutex.
+		 */
+		atomic_inc(&perf_sched_count);
+		mutex_unlock(&perf_sched_mutex);
+	}
+enabled:
 
 	account_event_cpu(event, event->cpu);
 }
@@ -9344,9 +9380,6 @@ void __init perf_event_init(void)
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
-	/* do not patch jump label more than once per second */
-	jump_label_rate_limit(&perf_sched_events, HZ);
-
 	/*
 	 * Build time assertion that we keep the data_head at the intended
 	 * location.  IOW, validation we got the __reserved[] size right.
-- 
cgit v1.2.3


From 28a967c3a2f99fa3b5f762f25cb2a319d933571b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:46 +0100
Subject: perf: Cure event->pending_disable race

Because event_sched_out() checks event->pending_disable _before_
actually disabling the event, it can happen that the event fires after
it checks but before it gets disabled.

This would leave event->pending_disable set and the queued irq_work
will try and process it.

However, if the event trigger was during schedule(), the event might
have been de-scheduled by the time the irq_work runs, and
perf_event_disable_local() will fail.

Fix this by checking event->pending_disable _after_ we call
event->pmu->del(). This depends on the latter being a compiler
barrier, such that the compiler does not lift the load and re-creates
the problem.

Tested-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.040469884@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea064ca8dd3c..de14b67a3b0b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1696,14 +1696,14 @@ event_sched_out(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_stopped = tstamp;
+	event->pmu->del(event, 0);
+	event->oncpu = -1;
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = tstamp;
-	event->pmu->del(event, 0);
-	event->oncpu = -1;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
-- 
cgit v1.2.3


From 3cbaa59069677920186dcf502632ca1df4329f80 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:47 +0100
Subject: perf: Fix ctx time tracking by introducing EVENT_TIME

Currently any ctx_sched_in() call will re-start the ctx time tracking,
this means that calls like:

	ctx_sched_in(.event_type = EVENT_PINNED);
	ctx_sched_in(.event_type = EVENT_FLEXIBLE);

will have a hole in their ctx time tracking. This is likely harmless
but can confuse things a little. By adding EVENT_TIME, we can have the
first ctx_sched_in() (is_active: 0 -> !0) start the time and any
further ctx_sched_in() will leave the timestamps alone.

Secondly, this allows for an early disable like:

	ctx_sched_out(.event_type = EVENT_TIME);

which would update the ctx time (if the ctx is active) and any further
calls to ctx_sched_out() would not further modify the ctx time.

For ctx_sched_in() any 0 -> !0 transition will automatically include
EVENT_TIME.

For ctx_sched_out(), any transition that clears EVENT_ALL will
automatically clear EVENT_TIME.

These two rules ensure that under normal circumstances we need not
bother with EVENT_TIME and get natural ctx time behaviour.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.100446561@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index de14b67a3b0b..75bde93eb76f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -314,6 +314,7 @@ again:
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
+	EVENT_TIME = 0x4,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -1294,16 +1295,18 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	u64 run_end;
 
+	lockdep_assert_held(&ctx->lock);
+
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
+
 	/*
 	 * in cgroup mode, time_enabled represents
 	 * the time the event was enabled AND active
@@ -2349,24 +2352,33 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	}
 
 	ctx->is_active &= ~event_type;
+	if (!(ctx->is_active & EVENT_ALL))
+		ctx->is_active = 0;
+
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 		if (!ctx->is_active)
 			cpuctx->task_ctx = NULL;
 	}
 
-	update_context_time(ctx);
-	update_cgrp_time_from_cpuctx(cpuctx);
-	if (!ctx->nr_active)
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* update (and stop) ctx time */
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
+	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(ctx->pmu);
-	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+	if (is_active & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
-	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+	if (is_active & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
@@ -2740,7 +2752,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		return;
 
-	ctx->is_active |= event_type;
+	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
 			cpuctx->task_ctx = ctx;
@@ -2748,18 +2760,24 @@ ctx_sched_in(struct perf_event_context *ctx,
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	now = perf_clock();
-	ctx->timestamp = now;
-	perf_cgroup_set_timestamp(task, ctx);
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* start ctx time */
+		now = perf_clock();
+		ctx->timestamp = now;
+		perf_cgroup_set_timestamp(task, ctx);
+	}
+
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+	if (is_active & EVENT_PINNED)
 		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
-	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+	if (is_active & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 }
 
-- 
cgit v1.2.3


From 7fce250915efca0f8f51dddee3ae89bf30d86ca5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:48 +0100
Subject: perf: Fix scaling vs. perf_event_enable_on_exec()

The recent commit 3e349507d12d ("perf: Fix perf_enable_on_exec() event
scheduling") caused this by moving task_ctx_sched_out() from before
__perf_event_mask_enable() to after it.

The overlooked consequence of that change is that task_ctx_sched_out()
would update the ctx time fields, and now __perf_event_mask_enable()
uses stale time.

In order to fix this, explicitly stop our context's time before
enabling the event(s).

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Fixes: 3e349507d12d ("perf: Fix perf_enable_on_exec() event scheduling")
Link: http://lkml.kernel.org/r/20160224174948.159242158@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75bde93eb76f..d0030886c402 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3143,6 +3143,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		enabled |= event_enable_on_exec(event, ctx);
 
-- 
cgit v1.2.3


From bd2afa49d194c6412c333e9fdd48bc5d06bb465d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:49 +0100
Subject: perf: Fix scaling vs. perf_event_enable()

Similar to the perf_enable_on_exec(), ensure that event timings are
consistent across perf_event_enable().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.218288698@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d0030886c402..57c25faecfa5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2069,14 +2069,27 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
 	     struct task_struct *task);
 
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			       struct perf_event_context *ctx)
+{
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
 				struct task_struct *task)
@@ -2227,17 +2240,18 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	update_context_time(ctx);
+	if (ctx->is_active)
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+
 	__perf_event_mark_enabled(event);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event)) {
-			perf_cgroup_set_timestamp(current, ctx); // XXX ?
+		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
-		}
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
 
@@ -2245,8 +2259,10 @@ static void __perf_event_enable(struct perf_event *event,
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
+	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -2658,18 +2674,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_sched_out(task, next);
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
-{
-	if (!cpuctx->task_ctx)
-		return;
-
-	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-		return;
-
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
-- 
cgit v1.2.3


From a096309bc4677f60caa8e93fcc613a55073c51d4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:50 +0100
Subject: perf: Fix scaling vs. perf_install_in_context()

Completely reworks perf_install_in_context() (again!) in order to
ensure that there will be no ctx time hole between add_event_to_ctx()
and any potential ctx_sched_in().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.279399438@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 115 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 70 insertions(+), 45 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57c25faecfa5..25edabd207de 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 		return;
 	}
 
-again:
 	if (task == TASK_TOMBSTONE)
 		return;
 
+again:
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
@@ -289,13 +289,15 @@ again:
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
-	if (task != TASK_TOMBSTONE) {
-		if (ctx->is_active) {
-			raw_spin_unlock_irq(&ctx->lock);
-			goto again;
-		}
-		func(event, NULL, ctx, data);
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}
+	if (ctx->is_active) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
 	}
+	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -2116,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int  __perf_install_in_context(void *info)
 {
-	struct perf_event_context *ctx = info;
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	bool activate = true;
+	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
 		raw_spin_lock(&ctx->lock);
-		/*
-		 * If we hit the 'wrong' task, we've since scheduled and
-		 * everything should be sorted, nothing to do!
-		 */
 		task_ctx = ctx;
-		if (ctx->task != current)
+
+		/* If we're on the wrong CPU, try again */
+		if (task_cpu(ctx->task) != smp_processor_id()) {
+			ret = -ESRCH;
 			goto unlock;
+		}
 
 		/*
-		 * If task_ctx is set, it had better be to us.
+		 * If we're on the right CPU, see if the task we target is
+		 * current, if not we don't have to activate the ctx, a future
+		 * context switch will do that for us.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
+		if (ctx->task != current)
+			activate = false;
+		else
+			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	ctx_resched(cpuctx, task_ctx);
+	if (activate) {
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		add_event_to_ctx(event, ctx);
+		ctx_resched(cpuctx, task_ctx);
+	} else {
+		add_event_to_ctx(event, ctx);
+	}
+
 unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 /*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
+ *
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
-	struct task_struct *task = NULL;
+	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
 
@@ -2166,42 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx,
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	if (!task) {
+		cpu_function_call(cpu, __perf_install_in_context, event);
+		return;
+	}
+
+	/*
+	 * Should not happen, we validate the ctx is still alive before calling.
+	 */
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+		return;
+
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
-	 *
-	 * So what we do is we add the event to the list here, which will allow
-	 * a future context switch to DTRT and then send a racy IPI. If the IPI
-	 * fails to hit the right task, this means a context switch must have
-	 * happened and that will have taken care of business.
 	 */
-	raw_spin_lock_irq(&ctx->lock);
-	task = ctx->task;
-
+again:
 	/*
-	 * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
-	 * ctx gets destroyed, we must not install an event into it.
-	 *
-	 * This is normally tested for after we acquire the mutex, so this is
-	 * a sanity check.
+	 * Cannot use task_function_call() because we need to run on the task's
+	 * CPU regardless of whether its current or not.
 	 */
+	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
 	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+		/*
+		 * Cannot happen because we already checked above (which also
+		 * cannot happen), and we hold ctx->mutex, which serializes us
+		 * against perf_event_exit_task_context().
+		 */
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-
-	if (ctx->is_active) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
-
-	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
-
-	if (task)
-		task_function_call(task, __perf_install_in_context, ctx);
-	else
-		cpu_function_call(cpu, __perf_install_in_context, ctx);
+	/*
+	 * Since !ctx->is_active doesn't mean anything, we must IPI
+	 * unconditionally.
+	 */
+	goto again;
 }
 
 /*
-- 
cgit v1.2.3


From 0da4cf3e0a68c97ef811569804616a811f786729 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Feb 2016 18:45:51 +0100
Subject: perf: Robustify task_function_call()

Since there is no serialization between task_function_call() doing
task_curr() and the other CPU doing context switches, we could end
up not sending an IPI even if we had to.

And I'm not sure I still buy my own argument we're OK.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dvyukov@google.com
Cc: eranian@google.com
Cc: oleg@redhat.com
Cc: panand@redhat.com
Cc: sasha.levin@oracle.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20160224174948.340031200@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 25edabd207de..614614821f00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -64,8 +64,17 @@ static void remote_function(void *data)
 	struct task_struct *p = tfc->p;
 
 	if (p) {
-		tfc->ret = -EAGAIN;
-		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+		/* -EAGAIN */
+		if (task_cpu(p) != smp_processor_id())
+			return;
+
+		/*
+		 * Now that we're on right CPU with IRQs disabled, we can test
+		 * if we hit the right task without races.
+		 */
+
+		tfc->ret = -ESRCH; /* No such (running) process */
+		if (p != current)
 			return;
 	}
 
@@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
 		.p	= p,
 		.func	= func,
 		.info	= info,
-		.ret	= -ESRCH, /* No such (running) process */
+		.ret	= -EAGAIN,
 	};
+	int ret;
 
-	if (task_curr(p))
-		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+	do {
+		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+		if (!ret)
+			ret = data.ret;
+	} while (ret == -EAGAIN);
 
-	return data.ret;
+	return ret;
 }
 
 /**
@@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event)
  *    rely on ctx->is_active and therefore cannot use event_function_call().
  *    See perf_install_in_context().
  *
- * This is because we need a ctx->lock serialized variable (ctx->is_active)
- * to reliably determine if a particular task/context is scheduled in. The
- * task_curr() use in task_function_call() is racy in that a remote context
- * switch is not a single atomic operation.
- *
- * As is, the situation is 'safe' because we set rq->curr before we do the
- * actual context switch. This means that task_curr() will fail early, but
- * we'll continue spinning on ctx->is_active until we've passed
- * perf_event_task_sched_out().
- *
- * Without this ctx->lock serialized variable we could have race where we find
- * the task (and hence the context) would not be active while in fact they are.
- *
  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
  */
 
@@ -212,7 +212,7 @@ static int event_function(void *info)
 	 */
 	if (ctx->task) {
 		if (ctx->task != current) {
-			ret = -EAGAIN;
+			ret = -ESRCH;
 			goto unlock;
 		}
 
-- 
cgit v1.2.3