diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 352 |
1 files changed, 180 insertions, 172 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca2bb629595f..5226cc26a095 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * LATENCY_WARN is enabled). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + */ +__read_mostly int sysctl_resched_latency_warn_ms = 100; +__read_mostly int sysctl_resched_latency_warn_once = 1; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. @@ -737,7 +747,7 @@ static void nohz_csd_func(void *info) /* * Release the rq::nohz_csd. */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu)); WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); @@ -928,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) @@ -1811,7 +1821,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) return cpu_online(cpu); /* Regular kernel threads don't get to stay during offline. */ - if (cpu_rq(cpu)->balance_push) + if (cpu_dying(cpu)) return false; /* But are allowed during online. */ @@ -1862,8 +1872,13 @@ struct migration_arg { struct set_affinity_pending *pending; }; +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ struct set_affinity_pending { refcount_t refs; + unsigned int stop_pending; struct completion done; struct cpu_stop_work stop_work; struct migration_arg arg; @@ -1898,8 +1913,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { - struct set_affinity_pending *pending; struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); @@ -1921,7 +1936,12 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); - pending = p->migration_pending; + /* + * If we were passed a pending, then ->stop_pending was set, thus + * p->migration_pending must have remained stable. + */ + WARN_ON_ONCE(pending && pending != p->migration_pending); + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1936,17 +1956,9 @@ static int migration_cpu_stop(void *data) complete = true; } - /* migrate_enable() -- we must not race against SCA */ if (dest_cpu < 0) { - /* - * When this was migrate_enable() but we no longer - * have a @pending, a concurrent SCA 'fixed' things - * and we should be valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) goto out; - } dest_cpu = cpumask_any_distribute(&p->cpus_mask); } @@ -1956,7 +1968,14 @@ static int migration_cpu_stop(void *data) else p->wake_cpu = dest_cpu; - } else if (dest_cpu < 0 || pending) { + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + + } else if (pending) { /* * This happens when we get migrated between migrate_enable()'s * preempt_enable() and scheduling the stopper task. At that @@ -1971,43 +1990,31 @@ static int migration_cpu_stop(void *data) * ->pi_lock, so the allowed mask is stable - if it got * somewhere allowed, we're done. */ - if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { p->migration_pending = NULL; complete = true; goto out; } /* - * When this was migrate_enable() but we no longer have an - * @pending, a concurrent SCA 'fixed' things and we should be - * valid again. Nothing to do. - */ - if (!pending) { - WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); - goto out; - } - - /* * When migrate_enable() hits a rq mis-match we can't reliably * determine is_migration_disabled() and so have to chase after * it. */ + WARN_ON_ONCE(!pending->stop_pending); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); return 0; } out: + if (pending) + pending->stop_pending = false; task_rq_unlock(rq, p, &rf); if (complete) complete_all(&pending->done); - /* For pending->{arg,stop_work} */ - pending = arg->pending; - if (pending && refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); - return 0; } @@ -2172,16 +2179,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * * (1) In the cases covered above. There is one more where the completion is * signaled within affine_move_task() itself: when a subsequent affinity request - * cancels the need for an active migration. Consider: + * occurs after the stopper bailed out due to the targeted task still being + * Migrate-Disable. Consider: * * Initial conditions: P0->cpus_mask = [0, 1] * - * P0@CPU0 P1 P2 - * - * migrate_disable(); - * <preempted> + * CPU0 P1 P2 + * <P0> + * migrate_disable(); + * <preempted> * set_cpus_allowed_ptr(P0, [1]); * <blocks> + * <migration/0> + * migration_cpu_stop() + * is_migration_disabled() + * <bails> * set_cpus_allowed_ptr(P0, [0, 1]); * <signal completion> * <awakes> @@ -2194,11 +2206,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag int dest_cpu, unsigned int flags) { struct set_affinity_pending my_pending = { }, *pending = NULL; - struct migration_arg arg = { - .task = p, - .dest_cpu = dest_cpu, - }; - bool complete = false; + bool stop_pending, complete = false; /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { @@ -2210,12 +2218,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag push_task = get_task_struct(p); } + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ pending = p->migration_pending; - if (pending) { - refcount_inc(&pending->refs); + if (pending && !pending->stop_pending) { p->migration_pending = NULL; complete = true; } + task_rq_unlock(rq, p, rf); if (push_task) { @@ -2224,7 +2236,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } if (complete) - goto do_complete; + complete_all(&pending->done); return 0; } @@ -2235,6 +2247,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag /* Install the request */ refcount_set(&my_pending.refs, 1); init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, /* any */ + .pending = &my_pending, + }; + p->migration_pending = &my_pending; } else { pending = p->migration_pending; @@ -2259,45 +2277,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (flags & SCA_MIGRATE_ENABLE) { - - refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ - p->migration_flags &= ~MDF_PUSH; - task_rq_unlock(rq, p, rf); - - pending->arg = (struct migration_arg) { - .task = p, - .dest_cpu = -1, - .pending = pending, - }; - - stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, - &pending->arg, &pending->stop_work); - - return 0; - } - if (task_running(rq, p) || p->state == TASK_WAKING) { /* - * Lessen races (and headaches) by delegating - * is_migration_disabled(p) checks to the stopper, which will - * run on the same CPU as said p. + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. */ + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; + + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; } else { if (!is_migration_disabled(p)) { if (task_on_rq_queued(p)) rq = move_queued_task(rq, rf, p, dest_cpu); - p->migration_pending = NULL; - complete = true; + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } } task_rq_unlock(rq, p, rf); -do_complete: if (complete) complete_all(&pending->done); } @@ -2305,7 +2319,7 @@ do_complete: wait_for_completion(&pending->done); if (refcount_dec_and_test(&pending->refs)) - wake_up_var(&pending->refs); + wake_up_var(&pending->refs); /* No UaF, just an address */ /* * Block the original owner of &pending until all subsequent callers @@ -2313,6 +2327,9 @@ do_complete: */ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + return 0; } @@ -4246,8 +4263,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq; - /* * New tasks start with FORK_PREEMPT_COUNT, see there and * finish_task_switch() for details. @@ -4257,7 +4272,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * PREEMPT_COUNT kernels). */ - rq = finish_task_switch(prev); + finish_task_switch(prev); preempt_enable(); if (current->set_child_tid) @@ -4522,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_SCHED_DEBUG +static u64 cpu_resched_latency(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || !latency_warn_ms) + return 0; + + if (system_state == SYSTEM_BOOTING) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } + + rq->ticks_without_resched++; + resched_latency = now - rq->last_seen_need_resched_ns; + if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) +{ + long val; + + if ((kstrtol(str, 0, &val))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; +} +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4533,6 +4597,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; struct rq_flags rf; unsigned long thermal_pressure; + u64 resched_latency; arch_scale_freq_tick(); sched_clock_tick(); @@ -4543,11 +4608,15 @@ void scheduler_tick(void) thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); - psi_task_tick(rq); rq_unlock(rq, &rf); + if (sched_feat(LATENCY_WARN) && resched_latency) + resched_latency_warn(cpu, resched_latency); + perf_event_task_tick(); #ifdef CONFIG_SMP @@ -5042,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); +#ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +#endif if (likely(prev != next)) { rq->nr_switches++; @@ -5367,23 +5439,23 @@ enum { preempt_dynamic_full, }; -static int preempt_dynamic_mode = preempt_dynamic_full; +int preempt_dynamic_mode = preempt_dynamic_full; -static int sched_dynamic_mode(const char *str) +int sched_dynamic_mode(const char *str) { if (!strcmp(str, "none")) - return 0; + return preempt_dynamic_none; if (!strcmp(str, "voluntary")) - return 1; + return preempt_dynamic_voluntary; if (!strcmp(str, "full")) - return 2; + return preempt_dynamic_full; - return -1; + return -EINVAL; } -static void sched_dynamic_update(int mode) +void sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in @@ -5398,25 +5470,25 @@ static void sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); - static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); - static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); - static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + static_call_update(might_resched, (void *)&__static_call_return0); + static_call_update(preempt_schedule, NULL); + static_call_update(preempt_schedule_notrace, NULL); + static_call_update(irqentry_exit_cond_resched, NULL); pr_info("Dynamic Preempt: none\n"); break; case preempt_dynamic_voluntary: static_call_update(cond_resched, __cond_resched); static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); - static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); - static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + static_call_update(preempt_schedule, NULL); + static_call_update(preempt_schedule_notrace, NULL); + static_call_update(irqentry_exit_cond_resched, NULL); pr_info("Dynamic Preempt: voluntary\n"); break; case preempt_dynamic_full: - static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); - static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(cond_resched, (void *)&__static_call_return0); + static_call_update(might_resched, (void *)&__static_call_return0); static_call_update(preempt_schedule, __preempt_schedule_func); static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); @@ -5440,77 +5512,8 @@ static int __init setup_preempt_mode(char *str) } __setup("preempt=", setup_preempt_mode); -#ifdef CONFIG_SCHED_DEBUG - -static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[16]; - int mode; - - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - mode = sched_dynamic_mode(strstrip(buf)); - if (mode < 0) - return mode; - - sched_dynamic_update(mode); - - *ppos += cnt; - - return cnt; -} - -static int sched_dynamic_show(struct seq_file *m, void *v) -{ - static const char * preempt_modes[] = { - "none", "voluntary", "full" - }; - int i; - - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { - if (preempt_dynamic_mode == i) - seq_puts(m, "("); - seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) - seq_puts(m, ")"); - - seq_puts(m, " "); - } - - seq_puts(m, "\n"); - return 0; -} - -static int sched_dynamic_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_dynamic_show, NULL); -} - -static const struct file_operations sched_dynamic_fops = { - .open = sched_dynamic_open, - .write = sched_dynamic_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug_dynamic(void) -{ - debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); - return 0; -} -late_initcall(sched_init_debug_dynamic); - -#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ - /* * This is the entry point to schedule() from kernel preemption * off of irq context. @@ -6386,6 +6389,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -7634,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work); /* * Ensure we only run per-cpu kthreads once the CPU goes !active. + * + * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only + * effective when the hotplug motion is down. */ static void balance_push(struct rq *rq) { @@ -7641,12 +7648,19 @@ static void balance_push(struct rq *rq) lockdep_assert_held(&rq->lock); SCHED_WARN_ON(rq->cpu != smp_processor_id()); + /* * Ensure the thing is persistent until balance_push_set(.on = false); */ rq->balance_callback = &balance_push_callback; /* + * Only active while going offline. + */ + if (!cpu_dying(rq->cpu)) + return; + + /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. * @@ -7654,7 +7668,7 @@ static void balance_push(struct rq *rq) * histerical raisins. */ if (rq->idle == push_task || - ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { /* @@ -7699,7 +7713,6 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); - rq->balance_push = on; if (on) { WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; @@ -7824,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq_flags rf; /* - * Make sure that when the hotplug state machine does a roll-back - * we clear balance_push. Ideally that would happen earlier... + * Clear the balance_push callback and prepare to schedule + * regular tasks. */ balance_push_set(cpu, false); @@ -8010,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); - /* - * Now that the CPU is offline, make sure we're welcome - * to new tasks once we come back up. - */ - balance_push_set(cpu, false); - calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); @@ -8200,7 +8207,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->balance_callback = NULL; + rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -8247,6 +8254,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); @@ -8971,7 +8979,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* - * Likewise, bound things on the otherside by preventing insane quota + * Likewise, bound things on the other side by preventing insane quota * periods. This also allows us to normalize in computing quota * feasibility. */ |