From bc54da2176cd38cedea767eff637229a191a2383 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 31 Aug 2015 17:13:55 +0200
Subject: sched/core: Remove unused argument from sched_class::task_move_group

The previous patches made the second argument go unused, remove it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97d276ff1edb..7c099e69fd5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7721,7 +7721,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, queued);
+		tsk->sched_class->task_move_group(tsk);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
-- 
cgit v1.2.3


From 446685e9bfa11174332fbb0b3218b37015fbf4ff Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@odin.com>
Date: Mon, 31 Aug 2015 15:12:56 +0300
Subject: sched/core: Delete PF_EXITING checks from cpu_cgroup_exit() callback

cgroup_exit() is not called from copy_process() after commit:

  e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()")

from do_exit(). So this check is useless and the comment is obsolete.

Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7c099e69fd5e..37ab6f9f0d1e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8193,14 +8193,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys_state *old_css,
 			    struct task_struct *task)
 {
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
 	sched_move_task(task);
 }
 
-- 
cgit v1.2.3


From 78a9c54649ea220065aad9902460a1d137c7eafd Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 11 Aug 2015 16:30:11 +0530
Subject: sched/numa: Rename numabalancing_enabled to sched_numa_balancing

Simple rename of the 'numabalancing_enabled' variable to 'sched_numa_balancing'.
No functional changes.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439290813-6683-2-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 6 +++---
 kernel/sched/fair.c  | 4 ++--
 kernel/sched/sched.h | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 37ab6f9f0d1e..2656af00ea5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2124,11 +2124,11 @@ void set_numabalancing_state(bool enabled)
 		sched_feat_set("NO_NUMA");
 }
 #else
-__read_mostly bool numabalancing_enabled;
+__read_mostly bool sched_numa_balancing;
 
 void set_numabalancing_state(bool enabled)
 {
-	numabalancing_enabled = enabled;
+	sched_numa_balancing = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
@@ -2138,7 +2138,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
 	struct ctl_table t;
 	int err;
-	int state = numabalancing_enabled;
+	int state = sched_numa_balancing;
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 36774e5dab01..3a6ac5598bff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	int local = !!(flags & TNF_FAULT_LOCAL);
 	int priv;
 
-	if (!numabalancing_enabled)
+	if (!sched_numa_balancing)
 		return;
 
 	/* for example, ksmd faulting in a user's mm */
@@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (numabalancing_enabled)
+	if (sched_numa_balancing)
 		task_tick_numa(rq, curr);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 637d5aeaa55a..d0b303d675d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1006,13 +1006,13 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #ifdef CONFIG_NUMA_BALANCING
 #define sched_feat_numa(x) sched_feat(x)
 #ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
+#define sched_numa_balancing sched_feat_numa(NUMA)
 #else
-extern bool numabalancing_enabled;
+extern bool sched_numa_balancing;
 #endif /* CONFIG_SCHED_DEBUG */
 #else
 #define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
+#define sched_numa_balancing (0)
 #endif /* CONFIG_NUMA_BALANCING */
 
 static inline u64 global_rt_period(void)
-- 
cgit v1.2.3


From c3b9bc5bbfc3750570d788afffd431263ef695c6 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 11 Aug 2015 16:30:12 +0530
Subject: sched/numa: Disable sched_numa_balancing on UMA systems

Commit 2a1ed24 ("sched/numa: Prefer NUMA hotness over cache hotness")
sets sched feature NUMA to true. However this can enable NUMA hinting
faults on a UMA system.

This commit ensures that NUMA hinting faults occur only on a NUMA system
by setting/resetting sched_numa_balancing.

This commit:

  - Makes sched_numa_balancing common to CONFIG_SCHED_DEBUG and
    !CONFIG_SCHED_DEBUG. Earlier it was only in !CONFIG_SCHED_DEBUG.

  - Checks for sched_numa_balancing instead of sched_feat(NUMA).

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439290813-6683-3-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 14 +++++---------
 kernel/sched/fair.c  |  4 ++--
 kernel/sched/sched.h |  6 ------
 3 files changed, 7 insertions(+), 17 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2656af00ea5e..ca665f8bec98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2115,22 +2115,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+__read_mostly bool sched_numa_balancing;
+
 void set_numabalancing_state(bool enabled)
 {
+	sched_numa_balancing = enabled;
+#ifdef CONFIG_SCHED_DEBUG
 	if (enabled)
 		sched_feat_set("NUMA");
 	else
 		sched_feat_set("NO_NUMA");
-}
-#else
-__read_mostly bool sched_numa_balancing;
-
-void set_numabalancing_state(bool enabled)
-{
-	sched_numa_balancing = enabled;
-}
 #endif /* CONFIG_SCHED_DEBUG */
+}
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3a6ac5598bff..e8f0828f4137 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5562,10 +5562,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
-	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+	if (!sched_numa_balancing)
 		return -1;
 
-	if (!sched_feat(NUMA))
+	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
 		return -1;
 
 	src_nid = cpu_to_node(env->src_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d0b303d675d6..0d8f885b215b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1004,14 +1004,8 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 #ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define sched_numa_balancing sched_feat_numa(NUMA)
-#else
 extern bool sched_numa_balancing;
-#endif /* CONFIG_SCHED_DEBUG */
 #else
-#define sched_feat_numa(x) (0)
 #define sched_numa_balancing (0)
 #endif /* CONFIG_NUMA_BALANCING */
 
-- 
cgit v1.2.3


From 2b49d84b259fc18e131026e5d38e7855352f71b9 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 11 Aug 2015 16:30:13 +0530
Subject: sched/numa: Remove the NUMA sched_feature

Variable sched_numa_balancing is available for both CONFIG_SCHED_DEBUG
and !CONFIG_SCHED_DEBUG. All code paths now check for
sched_numa_balancing. Hence remove sched_feat(NUMA).

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439290813-6683-4-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     |  6 ------
 kernel/sched/features.h | 16 ----------------
 2 files changed, 22 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca665f8bec98..e0bd88b26a27 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2120,12 +2120,6 @@ __read_mostly bool sched_numa_balancing;
 void set_numabalancing_state(bool enabled)
 {
 	sched_numa_balancing = enabled;
-#ifdef CONFIG_SCHED_DEBUG
-	if (enabled)
-		sched_feat_set("NUMA");
-	else
-		sched_feat_set("NO_NUMA");
-#endif /* CONFIG_SCHED_DEBUG */
 }
 
 #ifdef CONFIG_PROC_SYSCTL
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e6fd23b7459b..edf5902d5e57 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -72,21 +72,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
-
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-
-/*
- * NUMA will favor moving tasks towards nodes where a higher number of
- * hinting faults are recorded during active load balancing. It will
- * resist moving tasks towards nodes where a lower number of hinting
- * faults have been recorded.
- */
-SCHED_FEAT(NUMA,	true)
-#endif
-- 
cgit v1.2.3


From 2a595721a1fa6b684c1c818f379bef834ac3d65e Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Tue, 11 Aug 2015 21:54:21 +0530
Subject: sched/numa: Convert sched_numa_balancing to a static_branch

Variable sched_numa_balancing toggles numa_balancing feature. Hence
moving from a simple read mostly variable to a more apt static_branch.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1439310261-16124-1-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 10 +++++++---
 kernel/sched/fair.c  |  6 +++---
 kernel/sched/sched.h |  6 +-----
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e0bd88b26a27..b62127118fc8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2114,12 +2114,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-__read_mostly bool sched_numa_balancing;
 
 void set_numabalancing_state(bool enabled)
 {
-	sched_numa_balancing = enabled;
+	if (enabled)
+		static_branch_enable(&sched_numa_balancing);
+	else
+		static_branch_disable(&sched_numa_balancing);
 }
 
 #ifdef CONFIG_PROC_SYSCTL
@@ -2128,7 +2132,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
 	struct ctl_table t;
 	int err;
-	int state = sched_numa_balancing;
+	int state = static_branch_likely(&sched_numa_balancing);
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8f0828f4137..47ece22e7ae1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	int local = !!(flags & TNF_FAULT_LOCAL);
 	int priv;
 
-	if (!sched_numa_balancing)
+	if (!static_branch_likely(&sched_numa_balancing))
 		return;
 
 	/* for example, ksmd faulting in a user's mm */
@@ -5562,7 +5562,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
-	if (!sched_numa_balancing)
+	if (!static_branch_likely(&sched_numa_balancing))
 		return -1;
 
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
@@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (sched_numa_balancing)
+	if (!static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0d8f885b215b..2e8530d02b02 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1003,11 +1003,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
-#ifdef CONFIG_NUMA_BALANCING
-extern bool sched_numa_balancing;
-#else
-#define sched_numa_balancing (0)
-#endif /* CONFIG_NUMA_BALANCING */
+extern struct static_key_false sched_numa_balancing;
 
 static inline u64 global_rt_period(void)
 {
-- 
cgit v1.2.3


From 98d8fd8126676f7ba6e133e65b2ca4b17989d32c Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Fri, 14 Aug 2015 17:23:14 +0100
Subject: sched/fair: Initialize task load and utilization before placing task
 on rq

Task load or utilization is not currently considered in
select_task_rq_fair(), but if we want that in the future we should make
sure it is not zero for new tasks.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <Dietmar.Eggemann@arm.com>
Cc: Juri Lelli <Juri.Lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: daniel.lezcano@linaro.org
Cc: mturquette@baylibre.com
Cc: pang.xunlei@zte.com.cn
Cc: rjw@rjwysocki.net
Cc: sgurrappadi@nvidia.com
Cc: vincent.guittot@linaro.org
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1439569394-11974-7-git-send-email-morten.rasmussen@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b62127118fc8..6ab415aa15c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2343,6 +2343,8 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq *rq;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/* Initialize new task's runnable average */
+	init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -2352,8 +2354,6 @@ void wake_up_new_task(struct task_struct *p)
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-	/* Initialize new task's runnable average */
-	init_entity_runnable_average(&p->se);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-- 
cgit v1.2.3


From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2015 23:09:29 +0200
Subject: sched: Fix crash trying to dequeue/enqueue the idle thread

Sasha reports that his virtual machine tries to schedule the idle
thread since commit 6c37067e2786 ("sched: Change the
sched_class::set_cpus_allowed() calling context").

Hit trace shows this happening from idle_thread_get()->init_idle(),
which is the _second_ init_idle() invocation on that task_struct, the
first being done through idle_init()->fork_idle(). (this code is
insane...)

Because we call init_idle() twice in a row, its ->sched_class ==
&idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means
do_set_cpus_allowed() think we're queued and will call dequeue_task(),
which is implemented with BUG() for the idle class, seeing how
dequeueing the idle task is a daft thing.

Aside of the whole insanity of calling init_idle() _twice_, change the
code to call set_cpus_allowed_common() instead as this is 'obviously'
before the idle task gets ran etc..

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97d276ff1edb..f0d043ec0182 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+	/*
+	 * Its possible that init_idle() gets called multiple times on a task,
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialization.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
 	rq->curr = rq->idle = idle;
 	idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock(&rq->lock);
@@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
-- 
cgit v1.2.3


From 20f9cd2acb1d74a8bf4b4087267f586e6ecdbc03 Mon Sep 17 00:00:00 2001
From: Henrik Austad <henrik@austad.us>
Date: Wed, 9 Sep 2015 17:00:41 +0200
Subject: sched/core: Make policy-testing consistent

Most of the policy-tests are done via the <class>_policy() helpers with
the notable exception of idle. A new wrapper for valid_policy() has also
been added to improve readability  in set_load_weight().

This commit does not change the logical behavior of the scheduler core.

Signed-off-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1441810841-4756-1-git-send-email-henrik@austad.us
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 9 +++------
 kernel/sched/sched.h | 9 +++++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6ab415aa15c4..1b30b5b24a4a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
-	if (p->policy == SCHED_IDLE) {
+	if (idle_policy(p->policy)) {
 		load->weight = scale_load(WEIGHT_IDLEPRIO);
 		load->inv_weight = WMULT_IDLEPRIO;
 		return;
@@ -3733,10 +3733,7 @@ recheck:
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-		if (policy != SCHED_DEADLINE &&
-				policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
+		if (!valid_policy(policy))
 			return -EINVAL;
 	}
 
@@ -3792,7 +3789,7 @@ recheck:
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+		if (idle_policy(p->policy) && !idle_policy(policy)) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 167ab4844ee6..3845a711c65e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+static inline int idle_policy(int policy)
+{
+	return policy == SCHED_IDLE;
+}
 static inline int fair_policy(int policy)
 {
 	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
 {
 	return policy == SCHED_DEADLINE;
 }
+static inline bool valid_policy(int policy)
+{
+	return idle_policy(policy) || fair_policy(policy) ||
+		rt_policy(policy) || dl_policy(policy);
+}
 
 static inline int task_has_rt_policy(struct task_struct *p)
 {
-- 
cgit v1.2.3


From c6e1e7b5b7f031910850ddaf7bfa65ba3b4843ea Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 22 Sep 2015 12:48:59 +0200
Subject: sched/core: Make 'sched_domain_topology' declaration static

The 'sched_domain_topology' variable is only used within kernel/sched/core.c.
Make it static.

Signed-off-by: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1442918939-9907-1-git-send-email-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 kernel/sched/core.c   | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd38b3ee9e83..699228bb0035 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1127,8 +1127,6 @@ struct sched_domain_topology_level {
 #endif
 };
 
-extern struct sched_domain_topology_level *sched_domain_topology;
-
 extern void set_sched_topology(struct sched_domain_topology_level *tl);
 extern void wake_up_if_idle(int cpu);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1b30b5b24a4a..a91df6171f48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6445,7 +6445,8 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ NULL, },
 };
 
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+	default_topology;
 
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->mask; tl++)
-- 
cgit v1.2.3


From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Sep 2015 14:45:09 +0200
Subject: sched/core: Fix TASK_DEAD race in finish_task_switch()

So the problem this patch is trying to address is as follows:

        CPU0                            CPU1

        context_switch(A, B)
                                        ttwu(A)
                                          LOCK A->pi_lock
                                          A->on_cpu == 0
        finish_task_switch(A)
          prev_state = A->state  <-.
          WMB                      |
          A->on_cpu = 0;           |
          UNLOCK rq0->lock         |
                                   |    context_switch(C, A)
                                   `--  A->state = TASK_DEAD
          prev_state == TASK_DEAD
            put_task_struct(A)
                                        context_switch(A, C)
                                        finish_task_switch(A)
                                          A->state == TASK_DEAD
                                            put_task_struct(A)

The argument being that the WMB will allow the load of A->state on CPU0
to cross over and observe CPU1's store of A->state, which will then
result in a double-drop and use-after-free.

Now the comment states (and this was true once upon a long time ago)
that we need to observe A->state while holding rq->lock because that
will order us against the wakeup; however the wakeup will not in fact
acquire (that) rq->lock; it takes A->pi_lock these days.

We can obviously fix this by upgrading the WMB to an MB, but that is
expensive, so we'd rather avoid that.

The alternative this patch takes is: smp_store_release(&A->on_cpu, 0),
which avoids the MB on some archs, but not important ones like ARM.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@vger.kernel.org> # v3.1+
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: manfred@colorfullife.com
Cc: will.deacon@arm.com
Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()")
Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 10 +++++-----
 kernel/sched/sched.h |  5 +++--
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 615953141951..10a8faa1b0d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2517,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *		Manfred Spraul <manfred@colorfullife.com>
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could rave with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda117574c..6d2a119c7ad9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
+	 *
+	 * Pairs with the control dependency and rmb in try_to_wake_up().
 	 */
-	smp_wmb();
-	prev->on_cpu = 0;
+	smp_store_release(&prev->on_cpu, 0);
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
-- 
cgit v1.2.3


From b99def8b961448f5b9a550dddeeb718e3975e7a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:02:03 +0200
Subject: sched/core: Rework TASK_DEAD preemption exception

TASK_DEAD is special in that the final schedule call from do_exit()
must be done with preemption disabled.

This means we end up scheduling with a preempt_count() higher than
usual (3 instead of the 'expected' 2).

Since future patches will want to rely on an invariant
preempt_count() value during schedule, fix this up.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88a425443ff4..530fe8baa645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2949,12 +2949,8 @@ static inline void schedule_debug(struct task_struct *prev)
 #ifdef CONFIG_SCHED_STACK_END_CHECK
 	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 #endif
-	/*
-	 * Test if we are atomic. Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path. Otherwise whine
-	 * if we are scheduling when we should not.
-	 */
-	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+	if (unlikely(in_atomic_preempt_off()))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 
@@ -3053,6 +3049,17 @@ static void __sched __schedule(void)
 	rcu_note_context_switch();
 	prev = rq->curr;
 
+	/*
+	 * do_exit() calls schedule() with preemption disabled as an exception;
+	 * however we must fix that up, otherwise the next task will see an
+	 * inconsistent (higher) preempt count.
+	 *
+	 * It also avoids the below schedule_debug() test from complaining
+	 * about this.
+	 */
+	if (unlikely(prev->state == TASK_DEAD))
+		preempt_enable_no_resched_notrace();
+
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
-- 
cgit v1.2.3


From 609ca066386b2e64d4c0b0f55da327654962a0c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 17:52:18 +0200
Subject: sched/core: Create preempt_count invariant

Assuming units of PREEMPT_DISABLE_OFFSET for preempt_count() numbers.

Now that TASK_DEAD no longer results in preempt_count() == 3 during
scheduling, we will always call context_switch() with preempt_count()
== 2.

However, we don't always end up with preempt_count() == 2 in
finish_task_switch() because new tasks get created with
preempt_count() == 1.

Create FORK_PREEMPT_COUNT and set it to 2 and use that in the right
places. Note that we cannot use INIT_PREEMPT_COUNT as that serves
another purpose (boot).

After this, preempt_count() is invariant across the context switch,
with exception of PREEMPT_ACTIVE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/preempt.h |  2 +-
 include/asm-generic/preempt.h  |  2 +-
 include/linux/sched.h          | 17 ++++++++++++-----
 kernel/sched/core.c            | 23 +++++++++++++++++++++--
 4 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index b12f81022a6b..01e700d392cb 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,7 +31,7 @@ static __always_inline void preempt_count_set(int pc)
  * must be macros to avoid header recursion hell
  */
 #define init_task_preempt_count(p) do { \
-	task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
+	task_thread_info(p)->saved_preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 0bec580a4885..5d8ffa3e6f8c 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
  * must be macros to avoid header recursion hell
  */
 #define init_task_preempt_count(p) do { \
-	task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+	task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e5b8cbc4b8d6..23ca455d9582 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,11 +599,7 @@ struct task_cputime_atomic {
 		.sum_exec_runtime = ATOMIC64_INIT(0),		\
 	}
 
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED	(1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED	PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED	(PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /*
  * Disable preemption until the scheduler is running -- use an unconditional
@@ -613,6 +609,17 @@ struct task_cputime_atomic {
  */
 #define INIT_PREEMPT_COUNT	PREEMPT_OFFSET
 
+/*
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
+ *
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
+ */
+#define FORK_PREEMPT_COUNT	(2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
 /**
  * struct thread_group_cputimer - thread group interval timer counts
  * @cputime_atomic:	atomic thread group interval timers.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 530fe8baa645..8d8722b84dee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2504,6 +2504,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
+	/*
+	 * The previous task will have left us with a preempt_count of 2
+	 * because it left us after:
+	 *
+	 *	schedule()
+	 *	  preempt_disable();			// 1
+	 *	  __schedule()
+	 *	    raw_spin_lock_irq(&rq->lock)	// 2
+	 *
+	 * Also, see FORK_PREEMPT_COUNT.
+	 */
+
 	rq->prev_mm = NULL;
 
 	/*
@@ -2588,8 +2600,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
 	struct rq *rq;
 
-	/* finish_task_switch() drops rq->lock and enables preemtion */
-	preempt_disable();
+	/*
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
+
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
-- 
cgit v1.2.3


From fc13aebab7d8f0d19d557c721a0f25cdf7ae905c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:05:34 +0200
Subject: sched/core: Add preempt argument to __schedule()

There is only a single PREEMPT_ACTIVE use in the regular __schedule()
path and that is to circumvent the task->state check. Since the code
setting PREEMPT_ACTIVE is the immediate caller of __schedule() we can
replace this with a function argument.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d8722b84dee..0a71f89fb3fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3056,7 +3056,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -3096,7 +3096,7 @@ static void __sched __schedule(void)
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+	if (!preempt && prev->state) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
@@ -3161,7 +3161,7 @@ asmlinkage __visible void __sched schedule(void)
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule();
+		__schedule(false);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
@@ -3202,7 +3202,7 @@ static void __sched notrace preempt_schedule_common(void)
 {
 	do {
 		preempt_active_enter();
-		__schedule();
+		__schedule(true);
 		preempt_active_exit();
 
 		/*
@@ -3267,7 +3267,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
-		__schedule();
+		__schedule(true);
 		exception_exit(prev_ctx);
 
 		barrier();
@@ -3296,7 +3296,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	do {
 		preempt_active_enter();
 		local_irq_enable();
-		__schedule();
+		__schedule(true);
 		local_irq_disable();
 		preempt_active_exit();
 	} while (need_resched());
-- 
cgit v1.2.3


From c73464b1c8434ad4cbfd5369c3e724f3e8ffe5a4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:06:56 +0200
Subject: sched/core: Fix trace_sched_switch()

__trace_sched_switch_state() is the last remaining PREEMPT_ACTIVE
user, move trace_sched_switch() from prepare_task_switch() to
__schedule() and propagate the @preempt argument.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/trace/events/sched.h      | 22 +++++++++-------------
 kernel/sched/core.c               |  2 +-
 kernel/trace/ftrace.c             |  2 +-
 kernel/trace/trace_sched_switch.c |  3 ++-
 kernel/trace/trace_sched_wakeup.c |  2 +-
 5 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 539d6bc3216a..9b90c57517a9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
 	     TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
-	long state = p->state;
-
-#ifdef CONFIG_PREEMPT
 #ifdef CONFIG_SCHED_DEBUG
 	BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
+
 	/*
-	 * For all intents and purposes a preempted task is a running task.
+	 * Preemption ignores task state, therefore preempted tasks are always
+	 * RUNNING (we will not have dequeued if state != RUNNING).
 	 */
-	if (preempt_count() & PREEMPT_ACTIVE)
-		state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
-	return state;
+	return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
 }
 #endif /* CREATE_TRACE_POINTS */
 
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
  */
 TRACE_EVENT(sched_switch,
 
-	TP_PROTO(struct task_struct *prev,
+	TP_PROTO(bool preempt,
+		 struct task_struct *prev,
 		 struct task_struct *next),
 
-	TP_ARGS(prev, next),
+	TP_ARGS(preempt, prev, next),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
-		__entry->prev_state	= __trace_sched_switch_state(prev);
+		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0a71f89fb3fc..cfad7f5f74f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2470,7 +2470,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
-	trace_sched_switch(prev, next);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
@@ -3132,6 +3131,7 @@ static void __sched __schedule(bool preempt)
 		rq->curr = next;
 		++*switch_count;
 
+		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
 	} else {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b0623ac785a2..00611e95a8ee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5697,7 +5697,7 @@ free:
 }
 
 static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
 			struct task_struct *prev, struct task_struct *next)
 {
 	unsigned long long timestamp;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index f270088e9929..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int			sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 
 static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+		   struct task_struct *prev, struct task_struct *next)
 {
 	if (unlikely(!sched_ref))
 		return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 12cbe77b4136..4bcfbac289ff 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 
 static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
 			  struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array_cpu *data;
-- 
cgit v1.2.3


From 3d8f74dd4ca1da8a1a464bbafcf679e40c2fc10f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:09:19 +0200
Subject: sched/core: Stop setting PREEMPT_ACTIVE

Now that nothing tests for PREEMPT_ACTIVE anymore, stop setting it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 12 ------------
 kernel/sched/core.c     | 19 ++++++-------------
 2 files changed, 6 insertions(+), 25 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index bea8dd8ff5e0..448dfd0b2ea6 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -146,18 +146,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#define preempt_active_enter() \
-do { \
-	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-	barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-	barrier(); \
-	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfad7f5f74f8..6344d82a84f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3201,9 +3201,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		preempt_active_enter();
+		preempt_disable();
 		__schedule(true);
-		preempt_active_exit();
+		sched_preempt_enable_no_resched();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
@@ -3254,13 +3254,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		return;
 
 	do {
-		/*
-		 * Use raw __prempt_count() ops that don't call function.
-		 * We can't call functions before disabling preemption which
-		 * disarm preemption tracing recursions.
-		 */
-		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-		barrier();
+		preempt_disable_notrace();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -3270,8 +3264,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		__schedule(true);
 		exception_exit(prev_ctx);
 
-		barrier();
-		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+		preempt_enable_no_resched_notrace();
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3294,11 +3287,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		preempt_active_enter();
+		preempt_disable();
 		local_irq_enable();
 		__schedule(true);
 		local_irq_disable();
-		preempt_active_exit();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 
 	exception_exit(prev_state);
-- 
cgit v1.2.3


From 1dc0fffc48af94513e621f95dff730ed4f7317ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 17:57:39 +0200
Subject: sched/core: Robustify preemption leak checks

When we warn about a preempt_count leak; reset the preempt_count to
the known good value such that the problem does not ripple forward.

This is most important on x86 which has a per cpu preempt_count that is
not saved/restored (after this series). So if you schedule with an
invalid (!2*PREEMPT_DISABLE_OFFSET) preempt_count the next task is
messed up too.

Enforcing this invariant limits the borkage to just the one task.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/exit.c       | 4 +++-
 kernel/sched/core.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index ea95ee1b5ef7..443677c8efe6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 
-	if (unlikely(in_atomic()))
+	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
 			current->comm, task_pid_nr(current),
 			preempt_count());
+		preempt_count_set(PREEMPT_ENABLED);
+	}
 
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6344d82a84f6..d6989f85c641 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2968,8 +2968,10 @@ static inline void schedule_debug(struct task_struct *prev)
 	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 #endif
 
-	if (unlikely(in_atomic_preempt_off()))
+	if (unlikely(in_atomic_preempt_off())) {
 		__schedule_bug(prev);
+		preempt_count_set(PREEMPT_DISABLED);
+	}
 	rcu_sleep_check();
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-- 
cgit v1.2.3


From da7142e2ed735e1c1bef5a757dc55de35c65fbd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:11:45 +0200
Subject: sched/core: Simplify preempt_count tests

Since we stopped setting PREEMPT_ACTIVE, there is no need to mask it
out of preempt_count() tests.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 3 +--
 kernel/sched/core.c     | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 448dfd0b2ea6..b2676a16cfbe 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -126,8 +126,7 @@
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler)
  */
-#define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
 
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6989f85c641..ca260cc5d881 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7486,7 +7486,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = preempt_count() + rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
-- 
cgit v1.2.3


From 499d79559ffe4b9c0c3031752f6a40abd532fb75 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Sep 2015 18:52:36 +0200
Subject: sched/core: More notrace annotations

preempt_schedule_common() is marked notrace, but it does not use
_notrace() preempt_count functions and __schedule() is also not marked
notrace, which means that its perfectly possible to end up in the
tracer from preempt_schedule_common().

Steve says:

  | Yep, there's some history to this. This was originally the issue that
  | caused function tracing to go into infinite recursion. But now we have
  | preempt_schedule_notrace(), which is used by the function tracer, and
  | that function must not be traced till preemption is disabled.
  |
  | Now if function tracing is running and we take an interrupt when
  | NEED_RESCHED is set, it calls
  |
  |   preempt_schedule_common() (not traced)
  |
  | But then that calls preempt_disable() (traced)
  |
  | function tracer calls preempt_disable_notrace() followed by
  | preempt_enable_notrace() which will see NEED_RESCHED set, and it will
  | call preempt_schedule_notrace(), which stops the recursion, but
  | still calls __schedule() here, and that means when we return, we call
  | the __schedule() from preempt_schedule_common().
  |
  | That said, I prefer this patch. Preemption is disabled before calling
  | __schedule(), and we get rid of a one round recursion with the
  | scheduler.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca260cc5d881..98c4cf8182cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3057,7 +3057,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(bool preempt)
+static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -3203,9 +3203,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		preempt_disable();
+		preempt_disable_notrace();
 		__schedule(true);
-		sched_preempt_enable_no_resched();
+		preempt_enable_no_resched_notrace();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
-- 
cgit v1.2.3


From e2bf1c4b17aff25f07e0d2952d8c1c66643f33fe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Sep 2015 12:18:46 +0200
Subject: sched/core: Add preempt_count invariant check

Ingo requested I keep my debug check for the preempt_count invariant.

Requested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98c4cf8182cf..4554cde2f6f9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2514,6 +2514,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 *
 	 * Also, see FORK_PREEMPT_COUNT.
 	 */
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
 
 	rq->prev_mm = NULL;
 
-- 
cgit v1.2.3


From 1de64443d755f83af8ba8b558fded0c61afaef47 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Sep 2015 17:44:13 +0200
Subject: sched/core: Fix task and run queue sched_info::run_delay
 inconsistencies

Mike Meyer reported the following bug:

> During evaluation of some performance data, it was discovered thread
> and run queue run_delay accounting data was inconsistent with the other
> accounting data that was collected.  Further investigation found under
> certain circumstances execution time was leaking into the task and
> run queue accounting of run_delay.
>
> Consider the following sequence:
>
>     a. thread is running.
>     b. thread moves beween cgroups, changes scheduling class or priority.
>     c. thread sleeps OR
>     d. thread involuntarily gives up cpu.
>
> a. implies:
>
>     thread->sched_info.last_queued = 0
>
> a. and b. results in the following:
>
>     1. dequeue_task(rq, thread)
>
>            sched_info_dequeued(rq, thread)
>                delta = 0
>
>                sched_info_reset_dequeued(thread)
>                    thread->sched_info.last_queued = 0
>
>                thread->sched_info.run_delay += delta
>
>     2. enqueue_task(rq, thread)
>
>            sched_info_queued(rq, thread)
>
>                /* thread is still on cpu at this point. */
>                thread->sched_info.last_queued = task_rq(thread)->clock;
>
> c. results in:
>
>     dequeue_task(rq, thread)
>
>         sched_info_dequeued(rq, thread)
>
>             /* delta is execution time not run_delay. */
>             delta = task_rq(thread)->clock - thread->sched_info.last_queued
>
>         sched_info_reset_dequeued(thread)
>             thread->sched_info.last_queued = 0
>
>         thread->sched_info.run_delay += delta
>
>     Since thread was running between enqueue_task(rq, thread) and
>     dequeue_task(rq, thread), the delta above is really execution
>     time and not run_delay.
>
> d. results in:
>
>     __sched_info_switch(thread, next_thread)
>
>         sched_info_depart(rq, thread)
>
>             sched_info_queued(rq, thread)
>
>                 /* last_queued not updated due to being non-zero */
>                 return
>
>     Since thread was running between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread), the execution time
>     between enqueue_task(rq, thread) and
>     __sched_info_switch(thread, next_thread) now will become
>     associated with run_delay due to when last_queued was last updated.
>

This alternative patch solves the problem by not calling
sched_info_{de,}queued() in {de,en}queue_task(). Therefore the
sched_info state is preserved and things work as expected.

By inlining the {de,en}queue_task() functions the new condition
becomes (mostly) a compile-time constant and we'll not emit any new
branch instructions.

It even shrinks the code (due to inlining {en,de}queue_task()):

$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
   text    data     bss     dec     hex filename
  64019   23378    2344   89741   15e8d defconfig-build/kernel/sched/core.o
  64149   23378    2344   89871   15f0f defconfig-build/kernel/sched/core.o.orig

Reported-by: Mike Meyer <Mike.Meyer@Teradata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 44 +++++++++++++++++++++++++-------------------
 kernel/sched/sched.h | 14 ++++++++------
 2 files changed, 33 insertions(+), 25 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4554cde2f6f9..fb14a010426f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1692,7 +1694,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3325,7 +3327,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3357,7 +3359,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3375,7 +3377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3383,7 +3385,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3435,7 +3437,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3444,7 +3446,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3946,7 +3948,7 @@ change:
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3956,11 +3958,15 @@ change:
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5109,7 +5115,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5118,7 +5124,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7737,7 +7743,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7761,7 +7767,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 046242feff3a..e08cc4cf68e2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1151,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP		1
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_RESTORE	0x10
 
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
cgit v1.2.3


From ce03e4137bb22fc560ad7a07cf4138ae2cd59f65 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@163.com>
Date: Mon, 5 Oct 2015 21:26:05 +0800
Subject: sched/core: Drop unlikely behind BUG_ON()

(1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care
    less and the change is ok.

(2) PPC and MIPS, which HAVE_ARCH_BUG_ON, do not rely on branch predictions
    as it seems to be pointless [1] and thus callers should not be trying to
    push an optimization in the first place.

(3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an
    unlikely compiler flag already.

Hence, we can drop unlikely behind BUG_ON().

  [1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html

Signed-off-by: Geliang Tang <geliangtang@163.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/6fa7125979f98bbeac26e268271769b6ca935c8d.1444051018.git.geliangtang@163.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb14a010426f..a395db17ff4c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2971,7 +2971,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+	BUG_ON(task_stack_end_corrupted(prev));
 #endif
 
 	if (unlikely(in_atomic_preempt_off())) {
-- 
cgit v1.2.3


From 5a4fd0368517bc5b5399ef958f6d30cbff492918 Mon Sep 17 00:00:00 2001
From: "xiaofeng.yan" <yanxiaofeng@inspur.com>
Date: Wed, 23 Sep 2015 14:55:59 +0800
Subject: sched/core: Remove a parameter in the migrate_task_rq() function

The parameter "int next_cpu" in the following function is unused:

  migrate_task_rq(struct task_struct *p, int next_cpu)

Remove it.

Signed-off-by: xiaofeng.yan <yanxiaofeng@inspur.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1442991360-31945-1-git-send-email-yanxiaofeng@inspur.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/fair.c  | 2 +-
 kernel/sched/sched.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a395db17ff4c..1764a0f2a75b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1294,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p, new_cpu);
+			p->sched_class->migrate_task_rq(p);
 		p->se.nr_migrations++;
 		perf_event_task_migrate(p);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3bdc3da7bc6a..700eb548315f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5009,7 +5009,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
  * other assumptions, including the state of rq->lock, should be made.
  */
-static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p)
 {
 	/*
 	 * We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e08cc4cf68e2..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1190,7 +1190,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+	void (*migrate_task_rq)(struct task_struct *p);
 
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
-- 
cgit v1.2.3


From 84778472e1b6a27a8931712c40e8cf31143c8f6c Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 2 Sep 2015 01:28:44 -0700
Subject: sched: Export sched_setscheduler_nocheck

The new locktorture rtmutex_lock tests exercise priority boosting, which
means that they need to set some tasks to real-time priority.  To do this,
they use sched_setscheduler_nocheck().  However, this is not exported to
modules, which results in the following error when building locktorture
as a module:

ERROR: "sched_setscheduler_nocheck" [kernel/locking/locktorture.ko] undefined!

This commit therefore adds an EXPORT_SYMBOL_GPL() to allow this function
to be invoked from locktorture when built as a module.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 kernel/sched/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f9c92884817..c4e607873d6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4022,6 +4022,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-- 
cgit v1.2.3


From 2e91fa7f6d451e3ea9fec999065d2fd199691f9d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:53 -0400
Subject: cgroup: keep zombies associated with their original cgroups

cgroup_exit() is called when a task exits and disassociates the
exiting task from its cgroups and half-attach it to the root cgroup.
This is unnecessary and undesirable.

No controller actually needs an exiting task to be disassociated with
non-root cgroups.  Both cpu and perf_event controllers update the
association to the root cgroup from their exit callbacks just to keep
consistent with the cgroup core behavior.

Also, this disassociation makes it difficult to track resources held
by zombies or determine where the zombies came from.  Currently, pids
controller is completely broken as it uncharges on exit and zombies
always escape the resource restriction.  With cgroup association being
reset on exit, fixing it is pretty painful.

There's no reason to reset cgroup membership on exit.  The zombie can
be removed from its css_set so that it doesn't show up on
"cgroup.procs" and thus can't be migrated or interfere with cgroup
removal.  It can still pin and point to the css_set so that its cgroup
membership is maintained.  This patch makes cgroup core keep zombies
associated with their cgroups at the time of exit.

* Previous patches decoupled populated_cnt tracking from css_set
  lifetime, so a dying task can be simply unlinked from its css_set
  while pinning and pointing to the css_set.  This keeps css_set
  association from task side alive while hiding it from "cgroup.procs"
  and populated_cnt tracking.  The css_set reference is dropped when
  the task_struct is freed.

* ->exit() callback no longer needs the css arguments as the
  associated css never changes once PF_EXITING is set.  Removed.

* cpu and perf_events controllers no longer need ->exit() callbacks.
  There's no reason to explicitly switch away on exit.  The final
  schedule out is enough.  The callbacks are removed.

* On traditional hierarchies, nothing changes.  "/proc/PID/cgroup"
  still reports "/" for all zombies.  On the default hierarchy,
  "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged
  to at the time of exit.  If the cgroup gets removed before the task
  is reaped, " (deleted)" is appended.

v2: Build brekage due to missing dummy cgroup_free() when
    !CONFIG_CGROUP fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
---
 Documentation/cgroups/unified-hierarchy.txt |  4 +++
 include/linux/cgroup-defs.h                 |  4 +--
 include/linux/cgroup.h                      |  2 ++
 kernel/cgroup.c                             | 51 +++++++++++++++++++----------
 kernel/cgroup_pids.c                        |  6 ++--
 kernel/events/core.c                        | 16 ---------
 kernel/fork.c                               |  1 +
 kernel/sched/core.c                         | 16 ---------
 8 files changed, 44 insertions(+), 56 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 176b940f8327..6932453d37a2 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -374,6 +374,10 @@ supported and the interface files "release_agent" and
 
 - The "cgroup.clone_children" file is removed.
 
+- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
+  to before exiting.  If the cgroup is removed before the zombie is
+  reaped, " (deleted)" is appeneded to the path.
+
 
 5-3. Controller File Conventions
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 62413c3e2f4b..6a1ab64ee5f9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -435,9 +435,7 @@ struct cgroup_subsys {
 	int (*can_fork)(struct task_struct *task, void **priv_p);
 	void (*cancel_fork)(struct task_struct *task, void *priv);
 	void (*fork)(struct task_struct *task, void *priv);
-	void (*exit)(struct cgroup_subsys_state *css,
-		     struct cgroup_subsys_state *old_css,
-		     struct task_struct *task);
+	void (*exit)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
 	int early_init;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 46020735bcbb..22e3754f89c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
 			     void *old_ss_priv[CGROUP_CANFORK_COUNT]);
 void cgroup_exit(struct task_struct *p);
+void cgroup_free(struct task_struct *p);
 
 int cgroup_init_early(void);
 int cgroup_init(void);
@@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
 				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p) {}
+static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ba7b3284c2e4..918658497625 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
+
 		cgrp = task_cgroup_from_root(tsk, root);
-		path = cgroup_path(cgrp, buf, PATH_MAX);
-		if (!path) {
-			retval = -ENAMETOOLONG;
-			goto out_unlock;
+
+		/*
+		 * On traditional hierarchies, all zombie tasks show up as
+		 * belonging to the root cgroup.  On the default hierarchy,
+		 * while a zombie doesn't show up in "cgroup.procs" and
+		 * thus can't be migrated, its /proc/PID/cgroup keeps
+		 * reporting the cgroup it belonged to before exiting.  If
+		 * the cgroup is removed before the zombie is reaped,
+		 * " (deleted)" is appended to the cgroup path.
+		 */
+		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+			path = cgroup_path(cgrp, buf, PATH_MAX);
+			if (!path) {
+				retval = -ENAMETOOLONG;
+				goto out_unlock;
+			}
+		} else {
+			path = "/";
 		}
+
 		seq_puts(m, path);
-		seq_putc(m, '\n');
+
+		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+			seq_puts(m, " (deleted)\n");
+		else
+			seq_putc(m, '\n');
 	}
 
 	retval = 0;
@@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
-	bool put_cset = false;
 	int i;
 
 	/*
@@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk)
 		spin_lock_bh(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
 		spin_unlock_bh(&css_set_lock);
-		put_cset = true;
+	} else {
+		get_css_set(cset);
 	}
 
-	/* Reassign the task to the init_css_set. */
-	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
 	/* see cgroup_post_fork() for details */
-	for_each_subsys_which(ss, i, &have_exit_callback) {
-		struct cgroup_subsys_state *old_css = cset->subsys[i];
-		struct cgroup_subsys_state *css = task_css(tsk, i);
+	for_each_subsys_which(ss, i, &have_exit_callback)
+		ss->exit(tsk);
+}
 
-		ss->exit(css, old_css, tsk);
-	}
+void cgroup_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
 
-	if (put_cset)
-		put_css_set(cset);
+	put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..45f0856a61fe 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
 	css_put(old_css);
 }
 
-static void pids_exit(struct cgroup_subsys_state *css,
-		      struct cgroup_subsys_state *old_css,
-		      struct task_struct *task)
+static void pids_exit(struct task_struct *task)
 {
-	struct pids_cgroup *pids = css_pids(old_css);
+	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
 	pids_uncharge(pids, 1);
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..e9874949c787 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
 		task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-			     struct cgroup_subsys_state *old_css,
-			     struct task_struct *task)
-{
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
-	task_function_call(task, __perf_cgroup_move, task);
-}
-
 struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
-	.exit		= perf_cgroup_exit,
 	.attach		= perf_cgroup_attach,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..118743bb5964 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	cgroup_free(tsk);
 	task_numa_free(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3595403921bd..2cad9ba91036 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 		sched_move_task(task);
 }
 
-static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
-			    struct cgroup_subsys_state *old_css,
-			    struct task_struct *task)
-{
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
-	sched_move_task(task);
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.exit		= cpu_cgroup_exit,
 	.legacy_cftypes	= cpu_files,
 	.early_init	= 1,
 };
-- 
cgit v1.2.3


From 0baabb385eb4bce699ddab0db015112be6cf1e6a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Oct 2015 17:21:23 +0200
Subject: nohz: Revert "nohz: Set isolcpus when nohz_full is set"

This reverts:

  8cb9764fc88b ("nohz: Set isolcpus when nohz_full is set")

We assumed that full-nohz users always want scheduler isolation on full
dynticks CPUs, therefore we included full-nohz CPUs on cpu_isolated_map.

This means that tasks run by default on CPUs outside the nohz_full range
unless their affinity is explicity overwritten.

This suits pure isolation workloads but when the machine is needed to
run common workloads, the available sets of CPUs to run common tasks
becomes reduced.

We reach an extreme case when CONFIG_NO_HZ_FULL_ALL is enabled as it
leaves only CPU 0 for non-isolation tasks, which makes people think that
their supercomputer regressed to 90's UP - which is true in a sense.

Some full-nohz users appear to be interested in running normal workloads
either before or after an isolation workload. Full-nohz isn't optimized
toward normal workloads but it's still better than UP performance.

We are reaching a limitation in kernel presets here. Lets revert this
cpu_isolated_map inclusion and let userspace do its own scheduler
isolation using cpusets or explicit affinity settings.

Reported-by: Ingo Molnar <mingo@kernel.org>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Link: http://lkml.kernel.org/r/1444663283-30068-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 10a8faa1b0d4..5bd7d60658d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7238,9 +7238,6 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-	/* nohz_full won't take effect without isolating the cpus. */
-	tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
 	sched_init_numa();
 
 	/*
-- 
cgit v1.2.3


From 07f06cb3b5f6bd21374a48dbefdb431d71d53974 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Oct 2015 18:00:54 +0200
Subject: sched: Start stopper early

Ensure the stopper thread is active 'early', because the load balancer
pretty much assumes that its available. And when 'online && active' the
load-balancer is fully available.

Not only the numa balancing stop_two_cpus() caller relies on it, but
also the self migration stuff does, and at CPU_ONLINE time the cpu
really is 'free' to run anything.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20151009160054.GA10176@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c        |  1 -
 kernel/sched/core.c | 12 +++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6467521e1e15..c85df2775b73 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -475,7 +475,6 @@ static int smpboot_thread_call(struct notifier_block *nfb,
 
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		stop_machine_unpark(cpu);
 		smpboot_unpark_threads(cpu);
 		break;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f45a7c70f264..7ee8caea1195 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5545,21 +5545,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
+	int cpu = (long)hcpu;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_STARTING:
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
+
 	case CPU_ONLINE:
 		/*
 		 * At this point a starting CPU has marked itself as online via
 		 * set_cpu_online(). But it might not yet have marked itself
 		 * as active, which is essential from here on.
-		 *
-		 * Thus, fall-through and help the starting CPU along.
 		 */
+		set_cpu_active(cpu, true);
+		stop_machine_unpark(cpu);
+		return NOTIFY_OK;
+
 	case CPU_DOWN_FAILED:
-		set_cpu_active((long)hcpu, true);
+		set_cpu_active(cpu, true);
 		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
-- 
cgit v1.2.3


From 62694cd51322262a9142e946915fc4783113ccff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Oct 2015 18:36:29 +0200
Subject: sched: Move cpu_active() tests from stop_two_cpus() into
 migrate_swap_stop()

The cpu_active() tests are not fundamentally part of stop_two_cpus(),
move then into the scheduler where they belong.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c   | 4 ++++
 kernel/stop_machine.c | 9 ---------
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7ee8caea1195..a7b368e51f69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1335,12 +1335,16 @@ static int migrate_swap_stop(void *data)
 	struct rq *src_rq, *dst_rq;
 	int ret = -EAGAIN;
 
+	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+		return -EAGAIN;
+
 	src_rq = cpu_rq(arg->src_cpu);
 	dst_rq = cpu_rq(arg->dst_cpu);
 
 	double_raw_lock(&arg->src_task->pi_lock,
 			&arg->dst_task->pi_lock);
 	double_rq_lock(src_rq, dst_rq);
+
 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
 		goto unlock;
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e5a09d2dc575..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -275,15 +275,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
 
-	/*
-	 * We do not want to migrate to inactive CPU. FIXME: move this
-	 * into migrate_swap_stop() callback.
-	 */
-	if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
-		preempt_enable();
-		return -ENOENT;
-	}
-
 	if (cpu1 > cpu2)
 		swap(cpu1, cpu2);
 	if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
-- 
cgit v1.2.3


From e73e85f0593832aa583b252f9a16cf90ed6d30fa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 10 Oct 2015 20:53:15 +0200
Subject: sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS

If CONFIG_CPUSETS=n then "case cpuset" changes the state and runs
the already failed for_each_cpu() loop again for no reason.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20151010185315.GA24100@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7b368e51f69..b4d263db52a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1580,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			goto out;
 		}
 
+		/* No more Mr. Nice Guy. */
 		switch (state) {
 		case cpuset:
-			/* No more Mr. Nice Guy. */
-			cpuset_cpus_allowed_fallback(p);
-			state = possible;
-			break;
-
+			if (IS_ENABLED(CONFIG_CPUSETS)) {
+				cpuset_cpus_allowed_fallback(p);
+				state = possible;
+				break;
+			}
+			/* fall-through */
 		case possible:
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
-- 
cgit v1.2.3


From 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 23 Oct 2015 11:50:08 +0200
Subject: sched/core: Add missing lockdep_unpin() annotations

Luca and Wanpeng reported two missing annotations that led to
false lockdep complaints. Add the missing annotations.

Reported-by: Luca Abeni <luca.abeni@unitn.it>
Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cbce1a686700 ("sched,lockdep: Employ lock pinning")
Link: http://lkml.kernel.org/r/20151023095008.GY17308@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 9 ++++++++-
 kernel/sched/deadline.c | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/core.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5bd7d60658d3..bcd214e4b4d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p)
 	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * Nothing relies on rq->lock after this, so its fine to
+		 * drop it.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
+	}
 #endif
 	task_rq_unlock(rq, p, &flags);
 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 142df2668e5d..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 * Queueing this task back might have overloaded rq, check if we need
 	 * to kick someone away.
 	 */
-	if (has_pushable_dl_tasks(rq))
+	if (has_pushable_dl_tasks(rq)) {
+		/*
+		 * Nothing relies on rq->lock after this, so its safe to drop
+		 * rq->lock.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		push_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
+	}
 #endif
 
 unlock:
-- 
cgit v1.2.3