From bc54da2176cd38cedea767eff637229a191a2383 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Aug 2015 17:13:55 +0200 Subject: sched/core: Remove unused argument from sched_class::task_move_group The previous patches made the second argument go unused, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276ff1edb..7c099e69fd5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7721,7 +7721,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); -- cgit v1.2.3 From 446685e9bfa11174332fbb0b3218b37015fbf4ff Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 31 Aug 2015 15:12:56 +0300 Subject: sched/core: Delete PF_EXITING checks from cpu_cgroup_exit() callback cgroup_exit() is not called from copy_process() after commit: e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()") from do_exit(). So this check is useless and the comment is obsolete. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c099e69fd5e..37ab6f9f0d1e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8193,14 +8193,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - sched_move_task(task); } -- cgit v1.2.3 From 78a9c54649ea220065aad9902460a1d137c7eafd Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:11 +0530 Subject: sched/numa: Rename numabalancing_enabled to sched_numa_balancing Simple rename of the 'numabalancing_enabled' variable to 'sched_numa_balancing'. No functional changes. Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 37ab6f9f0d1e..2656af00ea5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2124,11 +2124,11 @@ void set_numabalancing_state(bool enabled) sched_feat_set("NO_NUMA"); } #else -__read_mostly bool numabalancing_enabled; +__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - numabalancing_enabled = enabled; + sched_numa_balancing = enabled; } #endif /* CONFIG_SCHED_DEBUG */ @@ -2138,7 +2138,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = sched_numa_balancing; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36774e5dab01..3a6ac5598bff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!sched_numa_balancing) return; /* for example, ksmd faulting in a user's mm */ @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (sched_numa_balancing) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 637d5aeaa55a..d0b303d675d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1006,13 +1006,13 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #ifdef CONFIG_NUMA_BALANCING #define sched_feat_numa(x) sched_feat(x) #ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) +#define sched_numa_balancing sched_feat_numa(NUMA) #else -extern bool numabalancing_enabled; +extern bool sched_numa_balancing; #endif /* CONFIG_SCHED_DEBUG */ #else #define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) +#define sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ static inline u64 global_rt_period(void) -- cgit v1.2.3 From c3b9bc5bbfc3750570d788afffd431263ef695c6 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:12 +0530 Subject: sched/numa: Disable sched_numa_balancing on UMA systems Commit 2a1ed24 ("sched/numa: Prefer NUMA hotness over cache hotness") sets sched feature NUMA to true. However this can enable NUMA hinting faults on a UMA system. This commit ensures that NUMA hinting faults occur only on a NUMA system by setting/resetting sched_numa_balancing. This commit: - Makes sched_numa_balancing common to CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. Earlier it was only in !CONFIG_SCHED_DEBUG. - Checks for sched_numa_balancing instead of sched_feat(NUMA). Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++--------- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 ------ 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2656af00ea5e..ca665f8bec98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2115,22 +2115,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) } #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG +__read_mostly bool sched_numa_balancing; + void set_numabalancing_state(bool enabled) { + sched_numa_balancing = enabled; +#ifdef CONFIG_SCHED_DEBUG if (enabled) sched_feat_set("NUMA"); else sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool sched_numa_balancing; - -void set_numabalancing_state(bool enabled) -{ - sched_numa_balancing = enabled; -} #endif /* CONFIG_SCHED_DEBUG */ +} #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a6ac5598bff..e8f0828f4137 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5562,10 +5562,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!sched_numa_balancing) return -1; - if (!sched_feat(NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return -1; src_nid = cpu_to_node(env->src_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0b303d675d6..0d8f885b215b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1004,14 +1004,8 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ #ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define sched_numa_balancing sched_feat_numa(NUMA) -#else extern bool sched_numa_balancing; -#endif /* CONFIG_SCHED_DEBUG */ #else -#define sched_feat_numa(x) (0) #define sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 2b49d84b259fc18e131026e5d38e7855352f71b9 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:13 +0530 Subject: sched/numa: Remove the NUMA sched_feature Variable sched_numa_balancing is available for both CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. All code paths now check for sched_numa_balancing. Hence remove sched_feat(NUMA). Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ------ kernel/sched/features.h | 16 ---------------- 2 files changed, 22 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca665f8bec98..e0bd88b26a27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2120,12 +2120,6 @@ __read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { sched_numa_balancing = enabled; -#ifdef CONFIG_SCHED_DEBUG - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -#endif /* CONFIG_SCHED_DEBUG */ } #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e6fd23b7459b..edf5902d5e57 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -72,21 +72,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) - SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING - -/* - * NUMA will favor moving tasks towards nodes where a higher number of - * hinting faults are recorded during active load balancing. It will - * resist moving tasks towards nodes where a lower number of hinting - * faults have been recorded. - */ -SCHED_FEAT(NUMA, true) -#endif -- cgit v1.2.3 From 2a595721a1fa6b684c1c818f379bef834ac3d65e Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 21:54:21 +0530 Subject: sched/numa: Convert sched_numa_balancing to a static_branch Variable sched_numa_balancing toggles numa_balancing feature. Hence moving from a simple read mostly variable to a more apt static_branch. Suggested-by: Peter Zijlstra Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439310261-16124-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++--- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 6 +----- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e0bd88b26a27..b62127118fc8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2114,12 +2114,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - sched_numa_balancing = enabled; + if (enabled) + static_branch_enable(&sched_numa_balancing); + else + static_branch_disable(&sched_numa_balancing); } #ifdef CONFIG_PROC_SYSCTL @@ -2128,7 +2132,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = sched_numa_balancing; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8f0828f4137..47ece22e7ae1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -5562,7 +5562,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return -1; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_numa_balancing) + if (!static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d8f885b215b..2e8530d02b02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1003,11 +1003,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -extern bool sched_numa_balancing; -#else -#define sched_numa_balancing (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { -- cgit v1.2.3 From 98d8fd8126676f7ba6e133e65b2ca4b17989d32c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:14 +0100 Subject: sched/fair: Initialize task load and utilization before placing task on rq Task load or utilization is not currently considered in select_task_rq_fair(), but if we want that in the future we should make sure it is not zero for new tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b62127118fc8..6ab415aa15c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2343,6 +2343,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2352,8 +2354,6 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; -- cgit v1.2.3 From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2015 23:09:29 +0200 Subject: sched: Fix crash trying to dequeue/enqueue the idle thread Sasha reports that his virtual machine tries to schedule the idle thread since commit 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context"). Hit trace shows this happening from idle_thread_get()->init_idle(), which is the _second_ init_idle() invocation on that task_struct, the first being done through idle_init()->fork_idle(). (this code is insane...) Because we call init_idle() twice in a row, its ->sched_class == &idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means do_set_cpus_allowed() think we're queued and will call dequeue_task(), which is implemented with BUG() for the idle class, seeing how dequeueing the idle task is a daft thing. Aside of the whole insanity of calling init_idle() _twice_, change the code to call set_cpus_allowed_common() instead as this is 'obviously' before the idle task gets ran etc.. Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276ff1edb..f0d043ec0182 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif raw_spin_unlock(&rq->lock); @@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } -- cgit v1.2.3 From 20f9cd2acb1d74a8bf4b4087267f586e6ecdbc03 Mon Sep 17 00:00:00 2001 From: Henrik Austad Date: Wed, 9 Sep 2015 17:00:41 +0200 Subject: sched/core: Make policy-testing consistent Most of the policy-tests are done via the _policy() helpers with the notable exception of idle. A new wrapper for valid_policy() has also been added to improve readability in set_load_weight(). This commit does not change the logical behavior of the scheduler core. Signed-off-by: Henrik Austad Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1441810841-4756-1-git-send-email-henrik@austad.us Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ kernel/sched/sched.h | 9 +++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ab415aa15c4..1b30b5b24a4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -3733,10 +3733,7 @@ recheck: } else { reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + if (!valid_policy(policy)) return -EINVAL; } @@ -3792,7 +3789,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (idle_policy(p->policy) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 167ab4844ee6..3845a711c65e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } */ #define RUNTIME_INF ((u64)~0ULL) +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -98,6 +102,11 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} static inline int task_has_rt_policy(struct task_struct *p) { -- cgit v1.2.3 From c6e1e7b5b7f031910850ddaf7bfa65ba3b4843ea Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 22 Sep 2015 12:48:59 +0200 Subject: sched/core: Make 'sched_domain_topology' declaration static The 'sched_domain_topology' variable is only used within kernel/sched/core.c. Make it static. Signed-off-by: Juergen Gross Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1442918939-9907-1-git-send-email-jgross@suse.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/sched/core.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd38b3ee9e83..699228bb0035 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1127,8 +1127,6 @@ struct sched_domain_topology_level { #endif }; -extern struct sched_domain_topology_level *sched_domain_topology; - extern void set_sched_topology(struct sched_domain_topology_level *tl); extern void wake_up_if_idle(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b30b5b24a4a..a91df6171f48 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6445,7 +6445,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -- cgit v1.2.3 From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 14:45:09 +0200 Subject: sched/core: Fix TASK_DEAD race in finish_task_switch() So the problem this patch is trying to address is as follows: CPU0 CPU1 context_switch(A, B) ttwu(A) LOCK A->pi_lock A->on_cpu == 0 finish_task_switch(A) prev_state = A->state <-. WMB | A->on_cpu = 0; | UNLOCK rq0->lock | | context_switch(C, A) `-- A->state = TASK_DEAD prev_state == TASK_DEAD put_task_struct(A) context_switch(A, C) finish_task_switch(A) A->state == TASK_DEAD put_task_struct(A) The argument being that the WMB will allow the load of A->state on CPU0 to cross over and observe CPU1's store of A->state, which will then result in a double-drop and use-after-free. Now the comment states (and this was true once upon a long time ago) that we need to observe A->state while holding rq->lock because that will order us against the wakeup; however the wakeup will not in fact acquire (that) rq->lock; it takes A->pi_lock these days. We can obviously fix this by upgrading the WMB to an MB, but that is expensive, so we'd rather avoid that. The alternative this patch takes is: smp_store_release(&A->on_cpu, 0), which avoids the MB on some archs, but not important ones like ARM. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: # v3.1+ Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: manfred@colorfullife.com Cc: will.deacon@arm.com Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()") Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 615953141951..10a8faa1b0d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda117574c..6d2a119c7ad9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ -- cgit v1.2.3 From b99def8b961448f5b9a550dddeeb718e3975e7a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:02:03 +0200 Subject: sched/core: Rework TASK_DEAD preemption exception TASK_DEAD is special in that the final schedule call from do_exit() must be done with preemption disabled. This means we end up scheduling with a preempt_count() higher than usual (3 instead of the 'expected' 2). Since future patches will want to rely on an invariant preempt_count() value during schedule, fix this up. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88a425443ff4..530fe8baa645 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2949,12 +2949,8 @@ static inline void schedule_debug(struct task_struct *prev) #ifdef CONFIG_SCHED_STACK_END_CHECK BUG_ON(unlikely(task_stack_end_corrupted(prev))); #endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) + + if (unlikely(in_atomic_preempt_off())) __schedule_bug(prev); rcu_sleep_check(); @@ -3053,6 +3049,17 @@ static void __sched __schedule(void) rcu_note_context_switch(); prev = rq->curr; + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. + */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + schedule_debug(prev); if (sched_feat(HRTICK)) -- cgit v1.2.3 From 609ca066386b2e64d4c0b0f55da327654962a0c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:52:18 +0200 Subject: sched/core: Create preempt_count invariant Assuming units of PREEMPT_DISABLE_OFFSET for preempt_count() numbers. Now that TASK_DEAD no longer results in preempt_count() == 3 during scheduling, we will always call context_switch() with preempt_count() == 2. However, we don't always end up with preempt_count() == 2 in finish_task_switch() because new tasks get created with preempt_count() == 1. Create FORK_PREEMPT_COUNT and set it to 2 and use that in the right places. Note that we cannot use INIT_PREEMPT_COUNT as that serves another purpose (boot). After this, preempt_count() is invariant across the context switch, with exception of PREEMPT_ACTIVE. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 2 +- include/asm-generic/preempt.h | 2 +- include/linux/sched.h | 17 ++++++++++++----- kernel/sched/core.c | 23 +++++++++++++++++++++-- 4 files changed, 35 insertions(+), 9 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01e700d392cb 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -31,7 +31,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->saved_preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 0bec580a4885..5d8ffa3e6f8c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc) * must be macros to avoid header recursion hell */ #define init_task_preempt_count(p) do { \ - task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ + task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ } while (0) #define init_idle_preempt_count(p, cpu) do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index e5b8cbc4b8d6..23ca455d9582 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,11 +599,7 @@ struct task_cputime_atomic { .sum_exec_runtime = ATOMIC64_INIT(0), \ } -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) -#else -#define PREEMPT_DISABLED PREEMPT_ENABLED -#endif +#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional @@ -613,6 +609,17 @@ struct task_cputime_atomic { */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET +/* + * Initial preempt_count value; reflects the preempt_count schedule invariant + * which states that during context switches: + * + * preempt_count() == 2*PREEMPT_DISABLE_OFFSET + * + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. + * Note: See finish_task_switch(). + */ +#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 530fe8baa645..8d8722b84dee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2504,6 +2504,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + rq->prev_mm = NULL; /* @@ -2588,8 +2600,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); -- cgit v1.2.3 From fc13aebab7d8f0d19d557c721a0f25cdf7ae905c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:05:34 +0200 Subject: sched/core: Add preempt argument to __schedule() There is only a single PREEMPT_ACTIVE use in the regular __schedule() path and that is to circumvent the task->state check. Since the code setting PREEMPT_ACTIVE is the immediate caller of __schedule() we can replace this with a function argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d8722b84dee..0a71f89fb3fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3056,7 +3056,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(void) +static void __sched __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3096,7 +3096,7 @@ static void __sched __schedule(void) rq->clock_skip_update <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { @@ -3161,7 +3161,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(); + __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); } @@ -3202,7 +3202,7 @@ static void __sched notrace preempt_schedule_common(void) { do { preempt_active_enter(); - __schedule(); + __schedule(true); preempt_active_exit(); /* @@ -3267,7 +3267,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(); + __schedule(true); exception_exit(prev_ctx); barrier(); @@ -3296,7 +3296,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_active_enter(); local_irq_enable(); - __schedule(); + __schedule(true); local_irq_disable(); preempt_active_exit(); } while (need_resched()); -- cgit v1.2.3 From c73464b1c8434ad4cbfd5369c3e724f3e8ffe5a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:06:56 +0200 Subject: sched/core: Fix trace_sched_switch() __trace_sched_switch_state() is the last remaining PREEMPT_ACTIVE user, move trace_sched_switch() from prepare_task_switch() to __schedule() and propagate the @preempt argument. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 22 +++++++++------------- kernel/sched/core.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 3 ++- kernel/trace/trace_sched_wakeup.c | 2 +- 5 files changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 539d6bc3216a..9b90c57517a9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS -static inline long __trace_sched_switch_state(struct task_struct *p) +static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) { - long state = p->state; - -#ifdef CONFIG_PREEMPT #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ + /* - * For all intents and purposes a preempted task is a running task. + * Preemption ignores task state, therefore preempted tasks are always + * RUNNING (we will not have dequeued if state != RUNNING). */ - if (preempt_count() & PREEMPT_ACTIVE) - state = TASK_RUNNING | TASK_STATE_MAX; -#endif /* CONFIG_PREEMPT */ - - return state; + return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; } #endif /* CREATE_TRACE_POINTS */ @@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p) */ TRACE_EVENT(sched_switch, - TP_PROTO(struct task_struct *prev, + TP_PROTO(bool preempt, + struct task_struct *prev, struct task_struct *next), - TP_ARGS(prev, next), + TP_ARGS(preempt, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) @@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(prev); + __entry->prev_state = __trace_sched_switch_state(preempt, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a71f89fb3fc..cfad7f5f74f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2470,7 +2470,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -3132,6 +3131,7 @@ static void __sched __schedule(bool preempt) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0623ac785a2..00611e95a8ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5697,7 +5697,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index f270088e9929..4c896a0101bd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 12cbe77b4136..4bcfbac289ff 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; -- cgit v1.2.3 From 3d8f74dd4ca1da8a1a464bbafcf679e40c2fc10f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:09:19 +0200 Subject: sched/core: Stop setting PREEMPT_ACTIVE Now that nothing tests for PREEMPT_ACTIVE anymore, stop setting it. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 12 ------------ kernel/sched/core.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 25 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bea8dd8ff5e0..448dfd0b2ea6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -146,18 +146,6 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) -#define preempt_active_enter() \ -do { \ - preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ - barrier(); \ -} while (0) - -#define preempt_active_exit() \ -do { \ - barrier(); \ - preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ -} while (0) - #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfad7f5f74f8..6344d82a84f6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3201,9 +3201,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_active_enter(); + preempt_disable(); __schedule(true); - preempt_active_exit(); + sched_preempt_enable_no_resched(); /* * Check again in case we missed a preemption opportunity @@ -3254,13 +3254,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { - /* - * Use raw __prempt_count() ops that don't call function. - * We can't call functions before disabling preemption which - * disarm preemption tracing recursions. - */ - __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); - barrier(); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3270,8 +3264,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); - barrier(); - __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + preempt_enable_no_resched_notrace(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); @@ -3294,11 +3287,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - preempt_active_enter(); + preempt_disable(); local_irq_enable(); __schedule(true); local_irq_disable(); - preempt_active_exit(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); -- cgit v1.2.3 From 1dc0fffc48af94513e621f95dff730ed4f7317ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 17:57:39 +0200 Subject: sched/core: Robustify preemption leak checks When we warn about a preempt_count leak; reset the preempt_count to the known good value such that the problem does not ripple forward. This is most important on x86 which has a per cpu preempt_count that is not saved/restored (after this series). So if you schedule with an invalid (!2*PREEMPT_DISABLE_OFFSET) preempt_count the next task is messed up too. Enforcing this invariant limits the borkage to just the one task. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 +++- kernel/sched/core.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1b5ef7..443677c8efe6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -706,10 +706,12 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) + if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } /* sync mm's RSS info before statistics gathering */ if (tsk->mm) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6344d82a84f6..d6989f85c641 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2968,8 +2968,10 @@ static inline void schedule_debug(struct task_struct *prev) BUG_ON(unlikely(task_stack_end_corrupted(prev))); #endif - if (unlikely(in_atomic_preempt_off())) + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit v1.2.3 From da7142e2ed735e1c1bef5a757dc55de35c65fbd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:11:45 +0200 Subject: sched/core: Simplify preempt_count tests Since we stopped setting PREEMPT_ACTIVE, there is no need to mask it out of preempt_count() tests. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 3 +-- kernel/sched/core.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 448dfd0b2ea6..b2676a16cfbe 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -126,8 +126,7 @@ * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) +#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d6989f85c641..ca260cc5d881 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7486,7 +7486,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = preempt_count() + rcu_preempt_depth(); return (nested == preempt_offset); } -- cgit v1.2.3 From 499d79559ffe4b9c0c3031752f6a40abd532fb75 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2015 18:52:36 +0200 Subject: sched/core: More notrace annotations preempt_schedule_common() is marked notrace, but it does not use _notrace() preempt_count functions and __schedule() is also not marked notrace, which means that its perfectly possible to end up in the tracer from preempt_schedule_common(). Steve says: | Yep, there's some history to this. This was originally the issue that | caused function tracing to go into infinite recursion. But now we have | preempt_schedule_notrace(), which is used by the function tracer, and | that function must not be traced till preemption is disabled. | | Now if function tracing is running and we take an interrupt when | NEED_RESCHED is set, it calls | | preempt_schedule_common() (not traced) | | But then that calls preempt_disable() (traced) | | function tracer calls preempt_disable_notrace() followed by | preempt_enable_notrace() which will see NEED_RESCHED set, and it will | call preempt_schedule_notrace(), which stops the recursion, but | still calls __schedule() here, and that means when we return, we call | the __schedule() from preempt_schedule_common(). | | That said, I prefer this patch. Preemption is disabled before calling | __schedule(), and we get rid of a one round recursion with the | scheduler. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca260cc5d881..98c4cf8182cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,7 +3057,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(bool preempt) +static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3203,9 +3203,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_disable(); + preempt_disable_notrace(); __schedule(true); - sched_preempt_enable_no_resched(); + preempt_enable_no_resched_notrace(); /* * Check again in case we missed a preemption opportunity -- cgit v1.2.3 From e2bf1c4b17aff25f07e0d2952d8c1c66643f33fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 12:18:46 +0200 Subject: sched/core: Add preempt_count invariant check Ingo requested I keep my debug check for the preempt_count invariant. Requested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98c4cf8182cf..4554cde2f6f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2514,6 +2514,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * * Also, see FORK_PREEMPT_COUNT. */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); rq->prev_mm = NULL; -- cgit v1.2.3 From 1de64443d755f83af8ba8b558fded0c61afaef47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Sep 2015 17:44:13 +0200 Subject: sched/core: Fix task and run queue sched_info::run_delay inconsistencies Mike Meyer reported the following bug: > During evaluation of some performance data, it was discovered thread > and run queue run_delay accounting data was inconsistent with the other > accounting data that was collected. Further investigation found under > certain circumstances execution time was leaking into the task and > run queue accounting of run_delay. > > Consider the following sequence: > > a. thread is running. > b. thread moves beween cgroups, changes scheduling class or priority. > c. thread sleeps OR > d. thread involuntarily gives up cpu. > > a. implies: > > thread->sched_info.last_queued = 0 > > a. and b. results in the following: > > 1. dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > delta = 0 > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > 2. enqueue_task(rq, thread) > > sched_info_queued(rq, thread) > > /* thread is still on cpu at this point. */ > thread->sched_info.last_queued = task_rq(thread)->clock; > > c. results in: > > dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > > /* delta is execution time not run_delay. */ > delta = task_rq(thread)->clock - thread->sched_info.last_queued > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > Since thread was running between enqueue_task(rq, thread) and > dequeue_task(rq, thread), the delta above is really execution > time and not run_delay. > > d. results in: > > __sched_info_switch(thread, next_thread) > > sched_info_depart(rq, thread) > > sched_info_queued(rq, thread) > > /* last_queued not updated due to being non-zero */ > return > > Since thread was running between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread), the execution time > between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread) now will become > associated with run_delay due to when last_queued was last updated. > This alternative patch solves the problem by not calling sched_info_{de,}queued() in {de,en}queue_task(). Therefore the sched_info state is preserved and things work as expected. By inlining the {de,en}queue_task() functions the new condition becomes (mostly) a compile-time constant and we'll not emit any new branch instructions. It even shrinks the code (due to inlining {en,de}queue_task()): $ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig text data bss dec hex filename 64019 23378 2344 89741 15e8d defconfig-build/kernel/sched/core.o 64149 23378 2344 89871 15f0f defconfig-build/kernel/sched/core.o.orig Reported-by: Mike Meyer Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 44 +++++++++++++++++++++++++------------------- kernel/sched/sched.h | 14 ++++++++------ 2 files changed, 33 insertions(+), 25 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4554cde2f6f9..fb14a010426f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); } if (running) put_prev_task(rq, p); @@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); } /* @@ -1692,7 +1694,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -3325,7 +3327,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; struct rq *rq; const struct sched_class *prev_class; @@ -3357,7 +3359,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3375,7 +3377,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag = ENQUEUE_REPLENISH; + enqueue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3383,7 +3385,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; + enqueue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3435,7 +3437,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3444,7 +3446,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3946,7 +3948,7 @@ change: queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3956,11 +3958,15 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -5109,7 +5115,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5118,7 +5124,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } #endif /* CONFIG_NUMA_BALANCING */ @@ -7737,7 +7743,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7761,7 +7767,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 046242feff3a..e08cc4cf68e2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1151,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #else -#define ENQUEUE_WAKING 0 +#define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 -#define DEQUEUE_SLEEP 1 +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) -- cgit v1.2.3 From ce03e4137bb22fc560ad7a07cf4138ae2cd59f65 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 5 Oct 2015 21:26:05 +0800 Subject: sched/core: Drop unlikely behind BUG_ON() (1) For !CONFIG_BUG cases, the bug call is a no-op, so we couldn't care less and the change is ok. (2) PPC and MIPS, which HAVE_ARCH_BUG_ON, do not rely on branch predictions as it seems to be pointless [1] and thus callers should not be trying to push an optimization in the first place. (3) For CONFIG_BUG and !HAVE_ARCH_BUG_ON cases, BUG_ON() contains an unlikely compiler flag already. Hence, we can drop unlikely behind BUG_ON(). [1] http://lkml.iu.edu/hypermail/linux/kernel/1101.3/02289.html Signed-off-by: Geliang Tang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/6fa7125979f98bbeac26e268271769b6ca935c8d.1444051018.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb14a010426f..a395db17ff4c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2971,7 +2971,7 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + BUG_ON(task_stack_end_corrupted(prev)); #endif if (unlikely(in_atomic_preempt_off())) { -- cgit v1.2.3 From 5a4fd0368517bc5b5399ef958f6d30cbff492918 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Wed, 23 Sep 2015 14:55:59 +0800 Subject: sched/core: Remove a parameter in the migrate_task_rq() function The parameter "int next_cpu" in the following function is unused: migrate_task_rq(struct task_struct *p, int next_cpu) Remove it. Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442991360-31945-1-git-send-email-yanxiaofeng@inspur.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a395db17ff4c..1764a0f2a75b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1294,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); + p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3bdc3da7bc6a..700eb548315f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5009,7 +5009,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. */ -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p) { /* * We are supposed to update the task to "current" time, then its up to date diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e08cc4cf68e2..efd3bfc7e347 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1190,7 +1190,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3 From 84778472e1b6a27a8931712c40e8cf31143c8f6c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 2 Sep 2015 01:28:44 -0700 Subject: sched: Export sched_setscheduler_nocheck The new locktorture rtmutex_lock tests exercise priority boosting, which means that they need to set some tasks to real-time priority. To do this, they use sched_setscheduler_nocheck(). However, this is not exported to modules, which results in the following error when building locktorture as a module: ERROR: "sched_setscheduler_nocheck" [kernel/locking/locktorture.ko] undefined! This commit therefore adds an EXPORT_SYMBOL_GPL() to allow this function to be invoked from locktorture when built as a module. Reported-by: Stephen Rothwell Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Acked-by: Ingo Molnar Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f9c92884817..c4e607873d6f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4022,6 +4022,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -- cgit v1.2.3 From 2e91fa7f6d451e3ea9fec999065d2fd199691f9d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: keep zombies associated with their original cgroups cgroup_exit() is called when a task exits and disassociates the exiting task from its cgroups and half-attach it to the root cgroup. This is unnecessary and undesirable. No controller actually needs an exiting task to be disassociated with non-root cgroups. Both cpu and perf_event controllers update the association to the root cgroup from their exit callbacks just to keep consistent with the cgroup core behavior. Also, this disassociation makes it difficult to track resources held by zombies or determine where the zombies came from. Currently, pids controller is completely broken as it uncharges on exit and zombies always escape the resource restriction. With cgroup association being reset on exit, fixing it is pretty painful. There's no reason to reset cgroup membership on exit. The zombie can be removed from its css_set so that it doesn't show up on "cgroup.procs" and thus can't be migrated or interfere with cgroup removal. It can still pin and point to the css_set so that its cgroup membership is maintained. This patch makes cgroup core keep zombies associated with their cgroups at the time of exit. * Previous patches decoupled populated_cnt tracking from css_set lifetime, so a dying task can be simply unlinked from its css_set while pinning and pointing to the css_set. This keeps css_set association from task side alive while hiding it from "cgroup.procs" and populated_cnt tracking. The css_set reference is dropped when the task_struct is freed. * ->exit() callback no longer needs the css arguments as the associated css never changes once PF_EXITING is set. Removed. * cpu and perf_events controllers no longer need ->exit() callbacks. There's no reason to explicitly switch away on exit. The final schedule out is enough. The callbacks are removed. * On traditional hierarchies, nothing changes. "/proc/PID/cgroup" still reports "/" for all zombies. On the default hierarchy, "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged to at the time of exit. If the cgroup gets removed before the task is reaped, " (deleted)" is appended. v2: Build brekage due to missing dummy cgroup_free() when !CONFIG_CGROUP fixed. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- Documentation/cgroups/unified-hierarchy.txt | 4 +++ include/linux/cgroup-defs.h | 4 +-- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 51 +++++++++++++++++++---------- kernel/cgroup_pids.c | 6 ++-- kernel/events/core.c | 16 --------- kernel/fork.c | 1 + kernel/sched/core.c | 16 --------- 8 files changed, 44 insertions(+), 56 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 176b940f8327..6932453d37a2 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -374,6 +374,10 @@ supported and the interface files "release_agent" and - The "cgroup.clone_children" file is removed. +- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged + to before exiting. If the cgroup is removed before the zombie is + reaped, " (deleted)" is appeneded to the path. + 5-3. Controller File Conventions diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 62413c3e2f4b..6a1ab64ee5f9 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -435,9 +435,7 @@ struct cgroup_subsys { int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); - void (*exit)(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task); + void (*exit)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); int early_init; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 46020735bcbb..22e3754f89c5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p, extern void cgroup_post_fork(struct task_struct *p, void *old_ss_priv[CGROUP_CANFORK_COUNT]); void cgroup_exit(struct task_struct *p); +void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p, static inline void cgroup_post_fork(struct task_struct *p, void *ss_priv[CGROUP_CANFORK_COUNT]) {} static inline void cgroup_exit(struct task_struct *p) {} +static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ba7b3284c2e4..918658497625 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); + cgrp = task_cgroup_from_root(tsk, root); - path = cgroup_path(cgrp, buf, PATH_MAX); - if (!path) { - retval = -ENAMETOOLONG; - goto out_unlock; + + /* + * On traditional hierarchies, all zombie tasks show up as + * belonging to the root cgroup. On the default hierarchy, + * while a zombie doesn't show up in "cgroup.procs" and + * thus can't be migrated, its /proc/PID/cgroup keeps + * reporting the cgroup it belonged to before exiting. If + * the cgroup is removed before the zombie is reaped, + * " (deleted)" is appended to the cgroup path. + */ + if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; + goto out_unlock; + } + } else { + path = "/"; } + seq_puts(m, path); - seq_putc(m, '\n'); + + if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) + seq_puts(m, " (deleted)\n"); + else + seq_putc(m, '\n'); } retval = 0; @@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; - bool put_cset = false; int i; /* @@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_bh(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); spin_unlock_bh(&css_set_lock); - put_cset = true; + } else { + get_css_set(cset); } - /* Reassign the task to the init_css_set. */ - RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) { - struct cgroup_subsys_state *old_css = cset->subsys[i]; - struct cgroup_subsys_state *css = task_css(tsk, i); + for_each_subsys_which(ss, i, &have_exit_callback) + ss->exit(tsk); +} - ss->exit(css, old_css, tsk); - } +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); - if (put_cset) - put_css_set(cset); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 806cd7693ac8..45f0856a61fe 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv) css_put(old_css); } -static void pids_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) +static void pids_exit(struct task_struct *task) { - struct pids_cgroup *pids = css_pids(old_css); + struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); pids_uncharge(pids, 1); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69c4299..e9874949c787 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - task_function_call(task, __perf_cgroup_move, task); -} - struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, - .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..118743bb5964 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + cgroup_free(tsk); task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..2cad9ba91036 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, sched_move_task(task); } -static void cpu_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, .legacy_cftypes = cpu_files, .early_init = 1, }; -- cgit v1.2.3 From 0baabb385eb4bce699ddab0db015112be6cf1e6a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Oct 2015 17:21:23 +0200 Subject: nohz: Revert "nohz: Set isolcpus when nohz_full is set" This reverts: 8cb9764fc88b ("nohz: Set isolcpus when nohz_full is set") We assumed that full-nohz users always want scheduler isolation on full dynticks CPUs, therefore we included full-nohz CPUs on cpu_isolated_map. This means that tasks run by default on CPUs outside the nohz_full range unless their affinity is explicity overwritten. This suits pure isolation workloads but when the machine is needed to run common workloads, the available sets of CPUs to run common tasks becomes reduced. We reach an extreme case when CONFIG_NO_HZ_FULL_ALL is enabled as it leaves only CPU 0 for non-isolation tasks, which makes people think that their supercomputer regressed to 90's UP - which is true in a sense. Some full-nohz users appear to be interested in running normal workloads either before or after an isolation workload. Full-nohz isn't optimized toward normal workloads but it's still better than UP performance. We are reaching a limitation in kernel presets here. Lets revert this cpu_isolated_map inclusion and let userspace do its own scheduler isolation using cpusets or explicit affinity settings. Reported-by: Ingo Molnar Reported-by: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Dave Jones Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Rik van Riel Link: http://lkml.kernel.org/r/1444663283-30068-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 10a8faa1b0d4..5bd7d60658d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7238,9 +7238,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - /* nohz_full won't take effect without isolating the cpus. */ - tick_nohz_full_add_cpus_to(cpu_isolated_map); - sched_init_numa(); /* -- cgit v1.2.3 From 07f06cb3b5f6bd21374a48dbefdb431d71d53974 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2015 18:00:54 +0200 Subject: sched: Start stopper early Ensure the stopper thread is active 'early', because the load balancer pretty much assumes that its available. And when 'online && active' the load-balancer is fully available. Not only the numa balancing stop_two_cpus() caller relies on it, but also the self migration stuff does, and at CPU_ONLINE time the cpu really is 'free' to run anything. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151009160054.GA10176@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 1 - kernel/sched/core.c | 12 +++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6467521e1e15..c85df2775b73 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -475,7 +475,6 @@ static int smpboot_thread_call(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - stop_machine_unpark(cpu); smpboot_unpark_threads(cpu); break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f45a7c70f264..7ee8caea1195 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5545,21 +5545,27 @@ static void set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: /* * At this point a starting CPU has marked itself as online via * set_cpu_online(). But it might not yet have marked itself * as active, which is essential from here on. - * - * Thus, fall-through and help the starting CPU along. */ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } -- cgit v1.2.3 From 62694cd51322262a9142e946915fc4783113ccff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2015 18:36:29 +0200 Subject: sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() The cpu_active() tests are not fundamentally part of stop_two_cpus(), move then into the scheduler where they belong. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ kernel/stop_machine.c | 9 --------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ee8caea1195..a7b368e51f69 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1335,12 +1335,16 @@ static int migrate_swap_stop(void *data) struct rq *src_rq, *dst_rq; int ret = -EAGAIN; + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; + src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e5a09d2dc575..867bc20e1ef1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -275,15 +275,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); - /* - * We do not want to migrate to inactive CPU. FIXME: move this - * into migrate_swap_stop() callback. - */ - if (!cpu_active(cpu1) || !cpu_active(cpu2)) { - preempt_enable(); - return -ENOENT; - } - if (cpu1 > cpu2) swap(cpu1, cpu2); if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { -- cgit v1.2.3 From e73e85f0593832aa583b252f9a16cf90ed6d30fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 10 Oct 2015 20:53:15 +0200 Subject: sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS If CONFIG_CPUSETS=n then "case cpuset" changes the state and runs the already failed for_each_cpu() loop again for no reason. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20151010185315.GA24100@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7b368e51f69..b4d263db52a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1580,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) goto out; } + /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; - + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; -- cgit v1.2.3 From 0aaafaabfcba8aa991913cd3280a5dbf7f111a2a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Oct 2015 11:50:08 +0200 Subject: sched/core: Add missing lockdep_unpin() annotations Luca and Wanpeng reported two missing annotations that led to false lockdep complaints. Add the missing annotations. Reported-by: Luca Abeni Reported-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cbce1a686700 ("sched,lockdep: Employ lock pinning") Link: http://lkml.kernel.org/r/20151023095008.GY17308@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/deadline.c | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bd7d60658d3..bcd214e4b4d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2366,8 +2366,15 @@ void wake_up_new_task(struct task_struct *p) trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } #endif task_rq_unlock(rq, p, &flags); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 142df2668e5d..8b0a15e285f9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Queueing this task back might have overloaded rq, check if we need * to kick someone away. */ - if (has_pushable_dl_tasks(rq)) + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + lockdep_unpin_lock(&rq->lock); push_dl_task(rq); + lockdep_pin_lock(&rq->lock); + } #endif unlock: -- cgit v1.2.3