From 4698f88c06b893f2acc0b443004a53bf490fde7c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 7 Jun 2016 14:43:16 -0500 Subject: sched/debug: Fix 'schedstats=enable' cmdline option The 'schedstats=enable' option doesn't work, and also produces the following warning during boot: WARNING: CPU: 0 PID: 0 at /home/jpoimboe/git/linux/kernel/jump_label.c:61 static_key_slow_inc+0x8c/0xa0 static_key_slow_inc used before call to jump_label_init Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc1+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 0000000000000086 3ae3475a4bea95d4 ffffffff81e03da8 ffffffff8143fc83 ffffffff81e03df8 0000000000000000 ffffffff81e03de8 ffffffff810b1ffb 0000003d00000096 ffffffff823514d0 ffff88007ff197c8 0000000000000000 Call Trace: [] dump_stack+0x85/0xc2 [] __warn+0xcb/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] static_key_slow_inc+0x8c/0xa0 [] static_key_enable+0x16/0x40 [] setup_schedstats+0x29/0x94 [] unknown_bootoption+0x89/0x191 [] parse_args+0x297/0x4b0 [] start_kernel+0x1d8/0x4a9 [] ? set_init_arg+0x55/0x55 [] ? early_idt_handler_array+0x120/0x120 [] x86_64_start_reservations+0x2f/0x31 [] x86_64_start_kernel+0x14a/0x16d The problem is that it tries to update the 'sched_schedstats' static key before jump labels have been initialized. Changing jump_label_init() to be called earlier before parse_early_param() wouldn't fix it: it would still fail trying to poke_text() because mm isn't yet initialized. Instead, just create a temporary '__sched_schedstats' variable which can be copied to the static key later during sched_init() after jump labels have been initialized. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/453775fe3433bed65731a583e228ccea806d18cd.1465322027.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..385c947482e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str) if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@ -2293,6 +2300,11 @@ out: } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@ -7487,6 +7501,8 @@ void __init sched_init(void) #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } -- cgit v1.2.3 From 29d6455178a09e1dc340380c582b13356227e8df Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: sched: panic on corrupted stack end Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..11546a6ed5df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3047,7 +3047,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { -- cgit v1.2.3 From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..13d0896aff87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4e33ad12bb68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; -- cgit v1.2.3 From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..4135ac83cb65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -- cgit v1.2.3 From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4135ac83cb65..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_active(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) + continue; + if (!cpu_online(dest_cpu)) continue; goto out; } -- cgit v1.2.3