diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 379 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 3 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 59 | ||||
-rw-r--r-- | kernel/sched/debug.c | 7 | ||||
-rw-r--r-- | kernel/sched/ext.c | 1578 | ||||
-rw-r--r-- | kernel/sched/ext.h | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 94 | ||||
-rw-r--r-- | kernel/sched/features.h | 3 | ||||
-rw-r--r-- | kernel/sched/idle.c | 5 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 2 | ||||
-rw-r--r-- | kernel/sched/psi.c | 26 | ||||
-rw-r--r-- | kernel/sched/rt.c | 67 | ||||
-rw-r--r-- | kernel/sched/sched.h | 209 | ||||
-rw-r--r-- | kernel/sched/stats.h | 49 | ||||
-rw-r--r-- | kernel/sched/syscalls.c | 59 | ||||
-rw-r--r-- | kernel/sched/wait_bit.c | 90 |
16 files changed, 1659 insertions, 973 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b6cc1cf499d6..95e40895a519 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -827,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -936,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -964,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1071,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. + */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -1187,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -1394,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. */ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -2012,11 +2058,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } - p->sched_class->enqueue_task(rq, p, flags); /* * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear @@ -2024,6 +2065,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p); + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) + sched_info_enqueue(rq, p); + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } @@ -2039,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); - } + + psi_dequeue(p, flags); /* * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' @@ -2130,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p, void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + struct task_struct *donor = rq->donor; + + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } @@ -2615,9 +2663,7 @@ int push_cpu_stop(void *arg) // XXX validate p is still the highest prio task if (task_rq(p) == rq) { - deactivate_task(rq, p, 0); - set_task_cpu(p, lowest_rq->cpu); - activate_task(lowest_rq, p, 0); + move_queued_task_locked(rq, lowest_rq, p); resched_curr(lowest_rq); } @@ -2677,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* @@ -2691,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -3303,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -3518,14 +3563,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); - else + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3659,6 +3706,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else @@ -4120,6 +4169,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4252,7 +4303,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4317,9 +4368,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns @@ -4412,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -4699,7 +4752,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; #ifdef CONFIG_SCHED_CLASS_EXT - } else if (task_should_scx(p)) { + } else if (task_should_scx(p->policy)) { p->sched_class = &ext_sched_class; #endif } else { @@ -4793,6 +4846,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4807,7 +4861,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4815,7 +4869,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5504,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -5572,7 +5626,8 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5583,19 +5638,23 @@ void sched_tick(void) sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; - curr = rq->curr; - psi_account_irqtime(rq, curr, NULL); + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); - curr->sched_class->task_tick(rq, curr, 0); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, curr); + task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5605,8 +5664,8 @@ void sched_tick(void) perf_event_task_tick(); - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); #ifdef CONFIG_SMP if (!scx_switched_all()) { @@ -5673,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr = rq->curr; if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). + */ + SCHED_WARN_ON(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5907,12 +5972,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, #ifdef CONFIG_SCHED_CLASS_EXT /* - * SCX requires a balance() call before every pick_next_task() including - * when waking up from SCHED_IDLE. If @start_class is below SCX, start - * from SCX instead. + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. */ - if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } #endif /* @@ -6293,10 +6361,7 @@ static bool try_steal_cookie(int this, int that) if (sched_task_is_throttled(p, this)) goto next; - deactivate_task(src, p, 0); - set_task_cpu(p, this); - activate_task(dst, p, 0); - + move_queued_task_locked(src, dst, p); resched_curr(dst); success = true; @@ -6491,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #define SM_RTLOCK_WAIT 2 /* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long task_state) +{ + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; +} + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -6537,6 +6641,7 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6591,41 +6696,18 @@ static void __sched notrace __schedule(int sched_mode) */ prev_state = READ_ONCE(prev->__state); if (sched_mode == SM_IDLE) { - if (!rq->nr_running) { + /* SCX must consult the BPF scheduler to tell if rq is empty */ + if (!rq->nr_running && !scx_enabled()) { next = prev; goto picked; } } else if (!preempt && prev_state) { - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { - int flags = DEQUEUE_NOCLOCK; - - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(prev_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - block_task(rq, prev, flags); - } + block = try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -6666,7 +6748,7 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_sched_switch(prev, next, block); trace_sched_switch(preempt, prev, next, prev_state); @@ -7009,20 +7091,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + #ifdef CONFIG_SCHED_CLASS_EXT - else if (task_should_scx(p)) - p->sched_class = &ext_sched_class; + if (task_should_scx(policy)) + return &ext_sched_class; #endif - else - p->sched_class = &fair_sched_class; - p->prio = prio; + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7068,7 +7150,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7126,8 +7208,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -7163,7 +7250,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + check_class_changing(rq, p, prev_class); if (queued) @@ -7325,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -7332,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -7339,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -7346,30 +7446,41 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif @@ -7389,6 +7500,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: @@ -7398,6 +7510,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; @@ -7409,6 +7522,7 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; @@ -7420,9 +7534,22 @@ static void __sched_dynamic_update(int mode) preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -7485,6 +7612,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -7505,6 +7634,7 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ @@ -7657,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -7673,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); #endif @@ -7695,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP @@ -7784,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -8520,6 +8647,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; @@ -8934,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -10227,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, */ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); return src_cid; } @@ -10239,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; struct mm_struct *mm = t->mm; - int src_cid, dst_cid, src_cpu; + int src_cid, src_cpu; + bool dst_cid_is_set; struct rq *src_rq; lockdep_assert_rq_held(dst_rq); @@ -10256,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allocation closest to 0 in cases where few threads migrate around * many CPUs. * - * If destination cid is already set, we may have to just clear - * the src cid to ensure compactness in frequent migrations - * scenarios. + * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. * * It is not useful to clear the src cid when the number of threads is * greater or equal to the number of allowed CPUs, because user-space @@ -10266,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) * allowed CPUs. */ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid = READ_ONCE(dst_pcpu_cid->cid); - if (!mm_cid_is_unset(dst_cid) && - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) return; src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); src_rq = cpu_rq(src_cpu); @@ -10279,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) src_cid); if (src_cid == -1) return; - if (!mm_cid_is_unset(dst_cid)) { + if (dst_cid_is_set) { __mm_cid_put(mm, src_cid); return; } /* Move src_cid to dst cpu. */ mm_cid_snapshot_time(dst_rq, mm); WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); } static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, @@ -10457,7 +10588,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; - task_work_add(curr, work, TWA_RESUME); + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); } void sched_mm_cid_exit_signals(struct task_struct *t) @@ -10522,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t) * Matches barrier in sched_mm_cid_remote_clear_old(). */ smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c6ba15388ea7..28c77904ea74 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - sugov_eas_rebuild_sd(); - out: + sugov_eas_rebuild_sd(); mutex_unlock(&global_tunables_lock); return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ce93d0bf452..d9d5a702f1a6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2380,12 +2381,12 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, &p->dl); } @@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2513,7 +2506,7 @@ next_node: if (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); @@ -2707,8 +2700,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2751,9 +2744,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. */ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f4035c7a0fa1..a48b2a701ec2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { static const char * preempt_modes[] = { - "none", "voluntary", "full" + "none", "voluntary", "full", "lazy", }; - int i; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9ee5a9a261cc..7fff1d045477 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -18,6 +18,12 @@ enum scx_consts { SCX_EXIT_DUMP_DFL_LEN = 32768, SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -193,8 +199,10 @@ struct scx_dump_ctx { /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * - * Userland can implement an arbitrary scheduling policy by implementing and - * loading operations in this table. + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. */ struct sched_ext_ops { /** @@ -212,10 +220,15 @@ struct sched_ext_ops { * dispatch. While an explicit custom mechanism can be added, * select_cpu() serves as the default way to wake up idle CPUs. * - * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p - * is dispatched, the ops.enqueue() callback will be skipped. Finally, - * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the - * local DSQ of whatever CPU is returned by this callback. + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. */ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); @@ -224,12 +237,12 @@ struct sched_ext_ops { * @p: task being enqueued * @enq_flags: %SCX_ENQ_* * - * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() - * or enqueue on the BPF scheduler. If not directly dispatched, the bpf - * scheduler owns @p and if it fails to dispatch @p, the task will - * stall. + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. * - * If @p was dispatched from ops.select_cpu(), this callback is + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is * skipped. */ void (*enqueue)(struct task_struct *p, u64 enq_flags); @@ -251,17 +264,17 @@ struct sched_ext_ops { void (*dequeue)(struct task_struct *p, u64 deq_flags); /** - * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs * @cpu: CPU to dispatch tasks for * @prev: previous task being switched out * * Called when a CPU's local dsq is empty. The operation should dispatch * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using - * scx_bpf_consume(). + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). * - * The maximum number of times scx_bpf_dispatch() can be called without - * an intervening scx_bpf_consume() is specified by + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by * ops.dispatch_max_batch. See the comments on top of the two functions * for more details. * @@ -269,7 +282,7 @@ struct sched_ext_ops { * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in * @prev->scx.flags, it is not enqueued yet and will be enqueued after * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. */ void (*dispatch)(s32 cpu, struct task_struct *prev); @@ -588,7 +601,7 @@ struct sched_ext_ops { * Update @tg's weight to @weight. */ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -#endif /* CONFIG_CGROUPS */ +#endif /* CONFIG_EXT_GROUP_SCHED */ /* * All online ops must come before ops.cpu_online(). @@ -624,6 +637,10 @@ struct sched_ext_ops { /** * exit - Clean up after the BPF scheduler * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. */ void (*exit)(struct scx_exit_info *info); @@ -691,12 +708,13 @@ enum scx_enq_flags { /* expose select ENQUEUE_* flags as enums */ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, /* high 32bits are SCX specific */ /* * Set the following to trigger preemption when calling - * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the * current task is cleared to zero and the CPU is kicked into the * scheduling path. Implies %SCX_ENQ_HEAD. */ @@ -778,7 +796,6 @@ enum scx_tg_flags { }; enum scx_ops_enable_state { - SCX_OPS_PREPPING, SCX_OPS_ENABLING, SCX_OPS_ENABLED, SCX_OPS_DISABLING, @@ -786,7 +803,6 @@ enum scx_ops_enable_state { }; static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_PREPPING] = "prepping", [SCX_OPS_ENABLING] = "enabling", [SCX_OPS_ENABLED] = "enabled", [SCX_OPS_DISABLING] = "disabling", @@ -853,7 +869,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static unsigned long scx_in_softlockup; +static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); +static int scx_ops_bypass_depth; +static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -865,6 +884,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); +#ifdef CONFIG_SMP +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); +#endif + static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -875,6 +899,13 @@ static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* + * A monotically increasing sequence number that is incremented every time a + * scheduler is enabled. This can be used by to check if any custom sched_ext + * scheduler has ever been used in the system. + */ +static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); + +/* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger * scx_ops_error(). @@ -918,8 +949,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* dispatch queues */ -static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { .key_len = 8, @@ -1022,6 +1060,16 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1245,86 +1293,104 @@ struct scx_task_iter { struct task_struct *locked; struct rq *rq; struct rq_flags rf; + u32 cnt; }; /** - * scx_task_iter_init - Initialize a task iterator + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init * - * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, - * @iter must eventually be exited with scx_task_iter_exit(). + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). * - * scx_tasks_lock may be released between this and the first next() call or - * between any two next() calls. If scx_tasks_lock is released between two - * next() calls, the caller is responsible for ensuring that the task being - * iterated remains accessible either through RCU read lock or obtaining a - * reference count. + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be * visited as long as they still exist. */ -static void scx_task_iter_init(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + spin_lock_irq(&scx_tasks_lock); + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } } /** - * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator - * @iter: iterator to unlock rq for + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock * * If @iter is in the middle of a locked iteration, it may be locking the rq of - * the task currently being visited. Unlock the rq if so. This function can be - * safely called anytime during an iteration. + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock * - * Returns %true if the rq @iter was locking is unlocked. %false if @iter was - * not locking an rq. + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. */ -static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) +static void scx_task_iter_relock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; - return true; - } else { - return false; - } + spin_lock_irq(&scx_tasks_lock); } /** - * scx_task_iter_exit - Exit a task iterator + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * - * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. - * If the iterator holds a task's rq lock, that rq lock is released. See - * scx_task_iter_init() for details. + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. */ -static void scx_task_iter_exit(struct scx_task_iter *iter) +static void scx_task_iter_stop(struct scx_task_iter *iter) { - lockdep_assert_held(&scx_tasks_lock); - - scx_task_iter_rq_unlock(iter); list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); } /** * scx_task_iter_next - Next task * @iter: iterator to walk * - * Visit the next task. See scx_task_iter_init() for details. + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - lockdep_assert_held(&scx_tasks_lock); + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) @@ -1345,14 +1411,14 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify - * whether they would like to filter out dead tasks. See scx_task_iter_init() + * whether they would like to filter out dead tasks. See scx_task_iter_start() * for details. */ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) { struct task_struct *p; - scx_task_iter_rq_unlock(iter); + __scx_task_iter_rq_unlock(iter); while ((p = scx_task_iter_next(iter))) { /* @@ -1630,7 +1696,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, scx_ops_error("attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = &scx_dsq_global; + dsq = find_global_dsq(p); raw_spin_lock(&dsq->lock); } } @@ -1796,21 +1862,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -{ - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -} - -static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -{ - lockdep_assert(rcu_read_lock_any_held()); - - if (dsq_id == SCX_DSQ_GLOBAL) - return &scx_dsq_global; - else - return find_user_dsq(dsq_id); -} - static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, struct task_struct *p) { @@ -1823,16 +1874,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return &scx_dsq_global; + return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; } - dsq = find_non_local_dsq(dsq_id); + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = find_global_dsq(p); + else + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return &scx_dsq_global; + return find_global_dsq(p); } return dsq; @@ -2004,7 +2059,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(&scx_dsq_global, p, enq_flags); + dispatch_enqueue(find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2267,7 +2322,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline - * between scx_bpf_dispatch() calls and here. Trigger error iff the + * between scx_bpf_dsq_insert() calls and here. Trigger error iff the * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { @@ -2350,15 +2405,120 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } } #else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ +/** + * move_task_between_dsqs() - Move a task from one DSQ to another + * @p: target task + * @enq_flags: %SCX_ENQ_* + * @src_dsq: DSQ @p is currently on, must not be a local DSQ + * @dst_dsq: DSQ @p is being moved to, can be any DSQ + * + * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local + * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq + * will change. As @p's task_rq is locked, this function doesn't need to use the + * holding_cpu mechanism. + * + * On return, @src_dsq is unlocked and only @p's new task_rq, which is the + * return value, is locked. + */ +static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct scx_dispatch_q *dst_dsq) +{ + struct rq *src_rq = task_rq(p), *dst_rq; + + BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(src_rq); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + dst_dsq = find_global_dsq(p); + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. + */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + return dst_rq; +} + +/* + * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly + * banging on the same DSQ on a large NUMA system to the point where switching + * to the bypass mode can take a long time. Inject artifical delays while the + * bypass mode is switching to guarantee timely completion. + */ +static void scx_ops_breather(struct rq *rq) +{ + u64 until; + + lockdep_assert_rq_held(rq); + + if (likely(!atomic_read(&scx_ops_breather_depth))) + return; + + raw_spin_rq_unlock(rq); + + until = ktime_get_ns() + NSEC_PER_MSEC; + + do { + int cnt = 1024; + while (atomic_read(&scx_ops_breather_depth) && --cnt) + cpu_relax(); + } while (atomic_read(&scx_ops_breather_depth) && + time_before64(ktime_get_ns(), until)); + + raw_spin_rq_lock(rq); +} + static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* + * This retry loop can repeatedly race against scx_ops_bypass() + * dequeueing tasks from @dsq trying to put the system into the bypass + * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can + * live-lock the machine into soft lockups. Give a breather. + */ + scx_ops_breather(rq); + + /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -2389,6 +2549,13 @@ retry: return false; } +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + /** * dispatch_to_local_dsq - Dispatch a task to a local dsq * @rq: current rq which is locked @@ -2422,7 +2589,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, #ifdef CONFIG_SMP if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2490,7 +2658,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * Dispatching to local DSQs may need to wait for queueing to complete or * require rq lock dancing. As we don't wanna do either while inside * ops.dispatch() to avoid locking order inversion, we split dispatching into - * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the * task and its qseq. Once ops.dispatch() returns, this function is called to * finish up. * @@ -2522,7 +2690,7 @@ retry: /* * If qseq doesn't match, @p has gone through at least one * dispatch/dequeue and re-enqueue cycle between - * scx_bpf_dispatch() and here and we have no claim on it. + * scx_bpf_dsq_insert() and here and we have no claim on it. */ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return; @@ -2583,7 +2751,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); if (static_branch_unlikely(&scx_ops_cpu_preempt) && unlikely(rq->scx.cpu_released)) { @@ -2591,10 +2759,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * If the previous sched_class for the current CPU was not SCX, * notify the BPF scheduler that it again has control of the * core. This callback complements ->cpu_release(), which is - * emitted in scx_next_task_picked(). + * emitted in switch_class(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2622,7 +2790,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) @@ -2647,7 +2815,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_dispatch_q(rq, &scx_dsq_global)) + if (consume_global_dsq(rq)) goto has_tasks; /* @@ -2897,12 +3065,11 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; /* - * If balance_scx() is telling us to keep running @prev, replenish slice - * if necessary and keep running @prev. Otherwise, pop the first one - * from the local DSQ. - * * WORKAROUND: * * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just @@ -2911,27 +3078,46 @@ static struct task_struct *pick_task_scx(struct rq *rq) * which then ends up calling pick_task_scx() without preceding * balance_scx(). * - * For now, ignore cases where $prev is not on SCX. This isn't great and - * can theoretically lead to stalls. However, for switch_all cases, this - * happens only while a BPF scheduler is being loaded or unloaded, and, - * for partial cases, fair will likely keep triggering this CPU. + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. * - * Once fair is fixed, restore WARN_ON_ONCE(). + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). + */ + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev_on_scx) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && !prev_on_scx)) { + /* only allowed during transitions */ + WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. */ - if ((rq->scx.flags & SCX_RQ_BAL_KEEP) && - prev->sched_class == &ext_sched_class) { + if (keep_prev) { p = prev; if (!p->scx.slice) p->scx.slice = SCX_SLICE_DFL; } else { p = first_local_task(rq); - if (!p) + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); return NULL; + } if (unlikely(!p->scx.slice)) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { - printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", - p->comm, p->pid); + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; @@ -3029,41 +3215,215 @@ found: goto retry; } +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) { + const struct cpumask *numa_cpus; + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return true; + + numa_cpus = cpumask_of_node(cpu_to_node(cpu)); + if (sd->span_weight != cpumask_weight(numa_cpus)) + return true; + } + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +static void update_selcpu_topology(void) +{ + bool enable_llc = false, enable_numa = false; + struct sched_domain *sd; + const struct cpumask *cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (sd) { + if (sd->span_weight < num_online_cpus()) + enable_llc = true; + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + */ + cpus = cpumask_of_node(cpu_to_node(cpu)); + if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) + enable_numa = true; + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + enable_llc ? "enabled" : "disabled"); + pr_debug("sched_ext: NUMA idle selection %s\n", + enable_numa ? "enabled" : "disabled"); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * Step 3 and 4 are performed only if the system has, respectively, multiple + * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa). + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) { + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; s32 cpu; *found = false; - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return prev_cpu; - } /* - * If WAKE_SYNC, the waker's local DSQ is empty, and the system is - * under utilized, wake up @p to the local DSQ of the waker. Checking - * only for an empty local DSQ is insufficient as it could give the - * wakee an unfair advantage when the system is oversaturated. - * Checking only for the presence of idle CPUs is also insufficient as - * the local DSQ of the waker could have tasks piled up on it even if - * there is an idle core elsewhere on the system. - */ - cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && - !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); + if (sd) + llc_cpus = sched_domain_span(sd); + } } - if (p->nr_cpus_allowed == 1) { - if (test_and_clear_cpu_idle(prev_cpu)) { + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + cpu = smp_processor_id(); + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + if (cpus_share_cache(cpu, prev_cpu) && + test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; - } else { - return prev_cpu; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + if (!cpumask_empty(idle_masks.cpu) && + !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; } } @@ -3072,29 +3432,80 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * partially idle @prev_cpu. */ if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any full idle core usable by the task. + */ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } + /* + * Use @prev_cpu if it's idle. + */ if (test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU usable by the task. + */ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) goto cpu_found; + rcu_read_unlock(); return prev_cpu; cpu_found: + rcu_read_unlock(); + *found = true; return cpu; } @@ -3114,7 +3525,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu)) { + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3179,7 +3590,7 @@ void __scx_update_idle(struct rq *rq, bool idle) { int cpu = cpu_of(rq); - if (SCX_HAS_OP(update_idle)) { + if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); if (!static_branch_unlikely(&scx_builtin_idle_enabled)) return; @@ -3217,6 +3628,9 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); + if (scx_enabled()) + update_selcpu_topology(); + if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) @@ -3512,12 +3926,7 @@ static void scx_ops_exit_task(struct task_struct *p) void init_scx_entity(struct sched_ext_entity *scx) { - /* - * init_idle() calls this function again after fork sequence is - * complete. Don't touch ->tasks_node as it's already linked. - */ - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - + memset(scx, 0, sizeof(*scx)); INIT_LIST_HEAD(&scx->dsq_list.node); RB_CLEAR_NODE(&scx->dsq_priq); scx->sticky_cpu = -1; @@ -3543,7 +3952,7 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_enabled()) + if (scx_ops_init_task_enabled) return scx_ops_init_task(p, task_group(p), true); else return 0; @@ -3551,7 +3960,7 @@ int scx_fork(struct task_struct *p) void scx_post_fork(struct task_struct *p) { - if (scx_enabled()) { + if (scx_ops_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -3683,6 +4092,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; static bool cgroup_warned_missing_weight; static bool cgroup_warned_missing_idle; @@ -3702,8 +4112,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg) static void scx_cgroup_warn_missing_idle(struct task_group *tg) { - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_idle) + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) return; if (!tg->idle) @@ -3724,15 +4133,18 @@ int scx_tg_online(struct task_group *tg) scx_cgroup_warn_missing_weight(tg); - if (SCX_HAS_OP(cgroup_init)) { - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); - if (!ret) + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; - else - ret = ops_sanitize_err("cgroup_init", ret); } else { tg->scx_flags |= SCX_TG_ONLINE; } @@ -3763,7 +4175,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) /* released in scx_finish/cancel_attach() */ percpu_down_read(&scx_cgroup_rwsem); - if (!scx_enabled()) + if (!scx_cgroup_enabled) return 0; cgroup_taskset_for_each(p, css, tset) { @@ -3806,7 +4218,7 @@ err: void scx_move_task(struct task_struct *p) { - if (!scx_enabled()) + if (!scx_cgroup_enabled) return; /* @@ -3842,7 +4254,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct task_struct *p; - if (!scx_enabled()) + if (!scx_cgroup_enabled) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { @@ -3859,7 +4271,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) { percpu_down_read(&scx_cgroup_rwsem); - if (tg->scx_weight != weight) { + if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, tg_cgrp(tg), weight); @@ -4031,6 +4443,8 @@ static void scx_cgroup_exit(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); + scx_cgroup_enabled = false; + /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and exit all the inited ones, all online cgroups are exited. @@ -4097,6 +4511,7 @@ static int scx_cgroup_init(void) css->cgroup, &args); if (ret) { css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -4106,6 +4521,9 @@ static int scx_cgroup_init(void) } rcu_read_unlock(); + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + return 0; } @@ -4154,11 +4572,19 @@ static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, } SCX_ATTR(hotplug_seq); +static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); +} +SCX_ATTR(enable_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, &scx_attr_hotplug_seq.attr, + &scx_attr_enable_seq.attr, NULL, }; @@ -4203,14 +4629,57 @@ static const struct kset_uevent_ops scx_uevent_ops = { * Used by sched_fork() and __setscheduler_prio() to pick the matching * sched_class. dl/rt are already handled. */ -bool task_should_scx(struct task_struct *p) +bool task_should_scx(int policy) { if (!scx_enabled() || unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; - return p->policy == SCHED_EXT; + return policy == SCHED_EXT; +} + +/** + * scx_softlockup - sched_ext softlockup handler + * + * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. + */ +void scx_softlockup(u32 dur_s) +{ + switch (scx_ops_enable_state()) { + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + default: + return; + } + + /* allow only one instance, cleared at the end of scx_ops_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + return; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might even be able to get to + * scx_ops_bypass(). + */ + atomic_inc(&scx_ops_breather_depth); + + scx_ops_error("soft lockup - CPU#%d stuck for %us", + smp_processor_id(), dur_s); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_ops_breather_depth); } /** @@ -4225,38 +4694,45 @@ bool task_should_scx(struct task_struct *p) * the DISABLING state and then cycling the queued tasks through dequeue/enqueue * to force global FIFO scheduling. * - * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. - * %SCX_OPS_ENQ_LAST is also ignored. + * - ops.select_cpu() is ignored and the default select_cpu() is used. + * + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. * - * b. ops.dispatch() is ignored. + * - ops.dispatch() is ignored. * - * c. balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice - * can't be trusted. Whenever a tick triggers, the running task is rotated to - * the tail of the queue with core_sched_at touched. + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. * - * d. pick_next_task() suppresses zero slice warning. + * - pick_next_task() suppresses zero slice warning. * - * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM - * operations. + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. * - * f. scx_prio_less() reverts to the default core_sched_at order. + * - scx_prio_less() reverts to the default core_sched_at order. */ static void scx_ops_bypass(bool bypass) { - int depth, cpu; + static DEFINE_RAW_SPINLOCK(bypass_lock); + int cpu; + unsigned long flags; + raw_spin_lock_irqsave(&bypass_lock, flags); if (bypass) { - depth = atomic_inc_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth <= 0); - if (depth != 1) - return; + scx_ops_bypass_depth++; + WARN_ON_ONCE(scx_ops_bypass_depth <= 0); + if (scx_ops_bypass_depth != 1) + goto unlock; } else { - depth = atomic_dec_return(&scx_ops_bypass_depth); - WARN_ON_ONCE(depth < 0); - if (depth != 0) - return; + scx_ops_bypass_depth--; + WARN_ON_ONCE(scx_ops_bypass_depth < 0); + if (scx_ops_bypass_depth != 0) + goto unlock; } + atomic_inc(&scx_ops_breather_depth); + /* * No task property is changing. We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -4271,7 +4747,7 @@ static void scx_ops_bypass(bool bypass) struct rq_flags rf; struct task_struct *p, *n; - rq_lock_irqsave(rq, &rf); + rq_lock(rq, &rf); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4307,11 +4783,16 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); - /* kick to restore ticks */ + /* resched to restore ticks and idle state */ resched_cpu(cpu); } + + atomic_dec(&scx_ops_breather_depth); +unlock: + raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) @@ -4416,27 +4897,34 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_init(&sti); /* * The BPF scheduler is going away. All tasks including %TASK_DEAD ones * must be switched out and exited synchronously. */ + percpu_down_write(&scx_fork_rwsem); + + scx_ops_init_task_enabled = false; + + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); - __setscheduler_prio(p, p->prio); + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); @@ -4444,33 +4932,28 @@ static void scx_ops_disable_workfn(struct kthread_work *work) check_class_changed(task_rq(p), p, old_class, p->prio); scx_ops_exit_task(p); } - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); if (ei->msg[0] != '\0') pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); - +#ifdef CONFIG_STACKTRACE stack_trace_print(ei->bt, ei->bt_len, 2); +#endif } else { pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason); @@ -4847,10 +5330,10 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, return; ei->exit_code = exit_code; - +#ifdef CONFIG_STACKTRACE if (kind >= SCX_EXIT_ERROR) ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); - +#endif va_start(args, fmt); vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); va_end(args); @@ -4913,11 +5396,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, ret; + int i, cpu, node, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation"); + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } @@ -4932,6 +5415,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } } + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ret = -EBUSY; goto err_unlock; @@ -4955,12 +5466,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) } /* - * Set scx_ops, transition to PREPPING and clear exit info to arm the + * Set scx_ops, transition to ENABLING and clear exit info to arm the * disable path. Failure triggers full disabling from here on. */ scx_ops = *ops; - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != SCX_OPS_DISABLED); atomic_set(&scx_exit_kind, SCX_EXIT_NONE); @@ -4981,7 +5492,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); if (ret) { ret = ops_sanitize_err("init", ret); - goto err_disable_unlock_cpus; + cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); + goto err_disable; } } @@ -4989,6 +5502,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); + check_hotplug_seq(ops); +#ifdef CONFIG_SMP + update_selcpu_topology(); +#endif cpus_read_unlock(); ret = validate_ops(ops); @@ -5016,57 +5533,40 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Lock out forks, cgroup on/offlining and moves before opening the - * floodgate so that they don't wander into the operations prematurely. - * - * We don't need to keep the CPUs stable but static_branch_*() requires - * cpus_read_lock() and scx_cgroup_rwsem must nest inside - * cpu_hotplug_lock because of the following dependency chain: - * - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem - * - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use - * static_branch_*_cpuslocked(). - * - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the - * following dependency chain: - * - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. */ - percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); - - check_hotplug_seq(ops); + scx_ops_bypass(true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + static_branch_enable(&scx_has_op[i]); if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable_cpuslocked(&scx_ops_enq_last); + static_branch_enable(&scx_ops_enq_last); if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + static_branch_enable(&scx_ops_enq_exiting); if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_enable(&scx_ops_cpu_preempt); if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { reset_idle_masks(); - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_enable(&scx_builtin_idle_enabled); } else { - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_enabled); } /* - * All cgroups should be initialized before letting in tasks. cgroup - * on/offlining and task migrations are already locked out. + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. */ - ret = scx_cgroup_init(); - if (ret) - goto err_disable_unlock_all; + percpu_down_write(&scx_fork_rwsem); - static_branch_enable_cpuslocked(&__scx_ops_enabled); + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5074,10 +5574,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * leaving as sched_ext_free() can handle both prepped and enabled * tasks. Prep all tasks first and then enable them with preemption * disabled. + * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. */ - spin_lock_irq(&scx_tasks_lock); + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; - scx_task_iter_init(&sti); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5087,84 +5596,67 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (!tryget_task_struct(p)) continue; - scx_task_iter_rq_unlock(&sti); - spin_unlock_irq(&scx_tasks_lock); + scx_task_iter_unlock(&sti); ret = scx_ops_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); - scx_task_iter_exit(&sti); - spin_unlock_irq(&scx_tasks_lock); - pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", - ret, p->comm, p->pid); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_READY); + put_task_struct(p); - spin_lock_irq(&scx_tasks_lock); + scx_task_iter_relock(&sti); } - scx_task_iter_exit(&sti); + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* - * All tasks are prepped but are still ops-disabled. Ensure that - * %current can't be scheduled out and switch everyone. - * preempt_disable() is necessary because we can't guarantee that - * %current won't be starved if scheduled out while switching. + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. */ - preempt_disable(); - - /* - * From here on, the disable path must assume that tasks have ops - * enabled and need to be recovered. - * - * Transition to ENABLING fails iff the BPF scheduler has already - * triggered scx_bpf_error(). Returning an error code here would lose - * the recorded error information. Exit indicating success so that the - * error is notified through ops.exit() with all the details. - */ - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { - preempt_enable(); - spin_unlock_irq(&scx_tasks_lock); - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; - goto err_disable_unlock_all; - } + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable(&__scx_ops_enabled); /* - * We're fully committed and can't fail. The PREPPED -> ENABLED + * We're fully committed and can't fail. The task READY -> ENABLED * transitions here are synchronized against sched_ext_free() through * scx_tasks_lock. */ - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - - scx_task_iter_init(&sti); + percpu_down_write(&scx_fork_rwsem); + scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - scx_set_task_state(p, SCX_TASK_READY); - __setscheduler_prio(p, p->prio); + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; check_class_changing(task_rq(p), p, old_class); sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); } - scx_task_iter_exit(&sti); - - spin_unlock_irq(&scx_tasks_lock); - preempt_enable(); - scx_cgroup_unlock(); - cpus_read_unlock(); + scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - /* see above ENABLING transition for the explanation on exiting with 0 */ + scx_ops_bypass(false); + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); - ret = 0; goto err_disable; } @@ -5176,6 +5668,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) kobject_uevent(scx_root_kobj, KOBJ_ADD); mutex_unlock(&scx_ops_enable_mutex); + atomic_long_inc(&scx_enable_seq); + return 0; err_del: @@ -5194,14 +5688,21 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); -err_disable_unlock_cpus: - cpus_read_unlock(); + scx_ops_bypass(false); err_disable: mutex_unlock(&scx_ops_enable_mutex); - /* must be fully disabled before returning */ - scx_ops_disable(SCX_EXIT_ERROR); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); kthread_flush_work(&scx_ops_disable_work); - return ret; + return 0; } @@ -5212,67 +5713,7 @@ err_disable: #include <linux/bpf.h> #include <linux/btf.h> -extern struct btf *btf_vmlinux; static const struct btf_type *task_struct_type; -static u32 task_struct_type_id; - -static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - struct btf *btf = bpf_get_btf_vmlinux(); - const struct bpf_struct_ops_desc *st_ops_desc; - const struct btf_member *member; - const struct btf_type *t; - u32 btf_id, member_idx; - const char *mname; - - /* struct_ops op args are all sequential, 64-bit numbers */ - if (off != arg_n * sizeof(__u64)) - return false; - - /* btf_id should be the type id of struct sched_ext_ops */ - btf_id = prog->aux->attach_btf_id; - st_ops_desc = bpf_struct_ops_find(btf, btf_id); - if (!st_ops_desc) - return false; - - /* BTF type of struct sched_ext_ops */ - t = st_ops_desc->type; - - member_idx = prog->expected_attach_type; - if (member_idx >= btf_type_vlen(t)) - return false; - - /* - * Get the member name of this struct_ops program, which corresponds to - * a field in struct sched_ext_ops. For example, the member name of the - * dispatch struct_ops program (callback) is "dispatch". - */ - member = &btf_type_member(t)[member_idx]; - mname = btf_name_by_offset(btf_vmlinux, member->name_off); - - if (!strcmp(mname, op)) { - /* - * The value is a pointer to a type (struct task_struct) given - * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), - * however, can be a NULL (PTR_MAYBE_NULL). The BPF program - * should check the pointer to make sure it is not NULL before - * using it, or the verifier will reject the program. - * - * Longer term, this is something that should be addressed by - * BTF, and be fully contained within the verifier. - */ - info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; - info->btf = btf_vmlinux; - info->btf_id = task_struct_type_id; - - return true; - } - - return false; -} static bool bpf_scx_is_valid_access(int off, int size, enum bpf_access_type type, @@ -5281,9 +5722,6 @@ static bool bpf_scx_is_valid_access(int off, int size, { if (type != BPF_READ) return false; - if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || - set_arg_maybe_null("yield", 1, off, size, type, prog, info)) - return true; if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (off % size != 0) @@ -5418,13 +5856,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) static int bpf_scx_init(struct btf *btf) { - s32 type_id; - - type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); - if (type_id < 0) - return -EINVAL; - task_struct_type = btf_type_by_id(btf, type_id); - task_struct_type_id = type_id; + task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); return 0; } @@ -5446,78 +5878,78 @@ static int bpf_scx_validate(void *kdata) return 0; } -static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -static void tick_stub(struct task_struct *p) {} -static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -static void running_stub(struct task_struct *p) {} -static void stopping_stub(struct task_struct *p, bool runnable) {} -static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -static void set_weight_stub(struct task_struct *p, u32 weight) {} -static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -static void update_idle_stub(s32 cpu, bool idle) {} -static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -static void enable_stub(struct task_struct *p) {} -static void disable_stub(struct task_struct *p) {} +static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } +static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} +static void sched_ext_ops__tick(struct task_struct *p) {} +static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__running(struct task_struct *p) {} +static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} +static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } +static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } +static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} +static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} +static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} +static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} +static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } +static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} +static void sched_ext_ops__enable(struct task_struct *p) {} +static void sched_ext_ops__disable(struct task_struct *p) {} #ifdef CONFIG_EXT_GROUP_SCHED -static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -static void cgroup_exit_stub(struct cgroup *cgrp) {} -static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} +static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} +static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} #endif -static void cpu_online_stub(s32 cpu) {} -static void cpu_offline_stub(s32 cpu) {} -static s32 init_stub(void) { return -EINVAL; } -static void exit_stub(struct scx_exit_info *info) {} -static void dump_stub(struct scx_dump_ctx *ctx) {} -static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} -static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} +static void sched_ext_ops__cpu_online(s32 cpu) {} +static void sched_ext_ops__cpu_offline(s32 cpu) {} +static s32 sched_ext_ops__init(void) { return -EINVAL; } +static void sched_ext_ops__exit(struct scx_exit_info *info) {} +static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} +static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} +static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} static struct sched_ext_ops __bpf_ops_sched_ext_ops = { - .select_cpu = select_cpu_stub, - .enqueue = enqueue_stub, - .dequeue = dequeue_stub, - .dispatch = dispatch_stub, - .tick = tick_stub, - .runnable = runnable_stub, - .running = running_stub, - .stopping = stopping_stub, - .quiescent = quiescent_stub, - .yield = yield_stub, - .core_sched_before = core_sched_before_stub, - .set_weight = set_weight_stub, - .set_cpumask = set_cpumask_stub, - .update_idle = update_idle_stub, - .cpu_acquire = cpu_acquire_stub, - .cpu_release = cpu_release_stub, - .init_task = init_task_stub, - .exit_task = exit_task_stub, - .enable = enable_stub, - .disable = disable_stub, + .select_cpu = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cpumask = sched_ext_ops__set_cpumask, + .update_idle = sched_ext_ops__update_idle, + .cpu_acquire = sched_ext_ops__cpu_acquire, + .cpu_release = sched_ext_ops__cpu_release, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, #ifdef CONFIG_EXT_GROUP_SCHED - .cgroup_init = cgroup_init_stub, - .cgroup_exit = cgroup_exit_stub, - .cgroup_prep_move = cgroup_prep_move_stub, - .cgroup_move = cgroup_move_stub, - .cgroup_cancel_move = cgroup_cancel_move_stub, - .cgroup_set_weight = cgroup_set_weight_stub, + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, #endif - .cpu_online = cpu_online_stub, - .cpu_offline = cpu_offline_stub, - .init = init_stub, - .exit = exit_stub, - .dump = dump_stub, - .dump_cpu = dump_cpu_stub, - .dump_task = dump_task_stub, + .cpu_online = sched_ext_ops__cpu_online, + .cpu_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cpu = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, }; static struct bpf_struct_ops bpf_sched_ext_ops = { @@ -5664,7 +6096,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) if (cpu != cpu_of(this_rq)) { /* * Pairs with smp_store_release() issued by this CPU in - * scx_next_task_picked() on the resched path. + * switch_class() on the resched path. * * We busy-wait here to guarantee that no other task can * be scheduled on our core before the target CPU has @@ -5764,7 +6196,6 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); #ifdef CONFIG_SMP BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); @@ -5822,16 +6253,21 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { - *is_idle = false; - return prev_cpu; + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + goto prev_cpu; } + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + #ifdef CONFIG_SMP return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#else +#endif + +prev_cpu: *is_idle = false; return prev_cpu; -#endif } __bpf_kfunc_end_defs(); @@ -5845,7 +6281,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { .set = &scx_kfunc_ids_select_cpu, }; -static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; @@ -5865,7 +6301,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) return true; } -static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, + u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; @@ -5892,14 +6329,14 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags __bpf_kfunc_start_defs(); /** - * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ - * @p: task_struct to dispatch - * @dsq_id: DSQ to dispatch to + * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * - * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe - * to call this function spuriously. Can be called from ops.enqueue(), + * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to + * call this function spuriously. Can be called from ops.enqueue(), * ops.select_cpu(), and ops.dispatch(). * * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch @@ -5908,14 +6345,14 @@ __bpf_kfunc_start_defs(); * ops.select_cpu() to be on the target CPU in the first place. * * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p - * will be directly dispatched to the corresponding dispatch queue after - * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be - * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). + * will be directly inserted into the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be + * inserted into the local DSQ of the CPU returned by ops.select_cpu(). * @enq_flags are OR'd with the enqueue flags on the enqueue path before the - * task is dispatched. + * task is inserted. * * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id - * and this function can be called upto ops.dispatch_max_batch times to dispatch + * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. * @@ -5927,10 +6364,10 @@ __bpf_kfunc_start_defs(); * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. */ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) { - if (!scx_dispatch_preamble(p, enq_flags)) + if (!scx_dsq_insert_preamble(p, enq_flags)) return; if (slice) @@ -5938,30 +6375,42 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, else p->scx.slice = p->scx.slice ?: 1; - scx_dispatch_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); + scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); } /** - * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ - * @p: task_struct to dispatch - * @dsq_id: DSQ to dispatch to + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @enq_flags: SCX_ENQ_* * - * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime and always - * consumed after the tasks in the FIFO queue. All other aspects are identical - * to scx_bpf_dispatch(). + * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime. All other aspects + * are identical to scx_bpf_dsq_insert(). * * @vtime ordering is according to time_before64() which considers wrapping. A * numerically larger vtime may indicate an earlier position in the ordering and * vice-versa. + * + * A DSQ can only be used as a FIFO or priority queue at any given time and this + * function must not be called on a DSQ which already has one or more FIFO tasks + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and + * SCX_DSQ_GLOBAL) cannot be used as priority queues. */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dispatch_preamble(p, enq_flags)) + if (!scx_dsq_insert_preamble(p, enq_flags)) return; if (slice) @@ -5971,12 +6420,22 @@ __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); + scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -5986,12 +6445,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .set = &scx_kfunc_ids_enqueue_dispatch, }; -static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; + struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; @@ -6019,6 +6477,13 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). Insert a + * breather if necessary. + */ + scx_ops_breather(src_rq); + locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -6037,51 +6502,18 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, /* @p is still on $src_dsq and stable, determine the destination */ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); - if (dst_dsq->id == SCX_DSQ_LOCAL) { - dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = &scx_dsq_global; - dst_rq = src_rq; - } - } else { - /* no need to migrate if destination is a non-local DSQ */ - dst_rq = src_rq; - } - /* - * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different - * CPU, @p will be migrated. + * Apply vtime and slice updates before moving so that the new time is + * visible before inserting into $dst_dsq. @p is still on $src_dsq but + * this is safe as we're locking it. */ - if (dst_dsq->id == SCX_DSQ_LOCAL) { - /* @p is going from a non-local DSQ to a local DSQ */ - if (src_rq == dst_rq) { - task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, - src_dsq, dst_rq); - raw_spin_unlock(&src_dsq->lock); - } else { - raw_spin_unlock(&src_dsq->lock); - move_remote_task_to_local_dsq(p, enq_flags, - src_rq, dst_rq); - locked_rq = dst_rq; - } - } else { - /* - * @p is going from a non-local DSQ to a non-local DSQ. As - * $src_dsq is already locked, do an abbreviated dequeue. - */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; - raw_spin_unlock(&src_dsq->lock); - - if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) - p->scx.dsq_vtime = kit->vtime; - dispatch_enqueue(dst_dsq, p, enq_flags); - } - + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) p->scx.slice = kit->slice; + /* execute move */ + locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { @@ -6133,21 +6565,20 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) } /** - * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ - * @dsq_id: DSQ to consume + * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to move task from * - * Consume a task from the non-local DSQ identified by @dsq_id and transfer it - * to the current CPU's local DSQ for execution. Can only be called from - * ops.dispatch(). + * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's + * local DSQ for execution. Can only be called from ops.dispatch(). * - * This function flushes the in-flight dispatches from scx_bpf_dispatch() before - * trying to consume the specified DSQ. It may also grab rq locks and thus can't - * be called under any BPF locks. + * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() + * before trying to move from the specified DSQ. It may also grab rq locks and + * thus can't be called under any BPF locks. * - * Returns %true if a task has been consumed, %false if there isn't any task to - * consume. + * Returns %true if a task has been moved, %false if there isn't any task to + * move. */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; @@ -6157,7 +6588,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) flush_dispatch_buf(dspc->rq); - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (unlikely(!dsq)) { scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); return false; @@ -6177,17 +6608,24 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); + return scx_bpf_dsq_move_to_local(dsq_id); +} + /** - * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ + * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress - * @slice: duration the dispatched task can run for in nsecs + * @slice: duration the moved task can run for in nsecs * - * Override the slice of the next task that will be dispatched from @it__iter - * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, - * the previous slice duration is kept. + * Override the slice of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous + * slice duration is kept. */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) +__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, + u64 slice) { struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; @@ -6195,18 +6633,26 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); + scx_bpf_dsq_move_set_slice(it__iter, slice); +} + /** - * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ + * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ * - * Override the vtime of the next task that will be dispatched from @it__iter - * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the - * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to - * dispatch the next task, the override is ignored and cleared. + * Override the vtime of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice + * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the + * override is ignored and cleared. */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) +__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, + u64 vtime) { struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; @@ -6214,8 +6660,16 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); + scx_bpf_dsq_move_set_vtime(it__iter, vtime); +} + /** - * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ + * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress * @p: task to transfer * @dsq_id: DSQ to move @p to @@ -6230,8 +6684,7 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have * been queued before the iteration started. * - * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to - * update. + * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. * * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). @@ -6239,16 +6692,25 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( * Returns %true if @p has been consumed, %false if @p had already been consumed * or dequeued. */ +__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, - p, dsq_id, enq_flags); + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); + return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); } /** - * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ + * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress * @p: task to transfer * @dsq_id: DSQ to move @p to @@ -6258,19 +6720,27 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, * priority queue of the DSQ specified by @dsq_id. The destination must be a * user DSQ as only user DSQs support priority queue. * - * @p's slice and vtime are kept by default. Use - * scx_bpf_dispatch_from_dsq_set_slice() and - * scx_bpf_dispatch_from_dsq_set_vtime() to update. + * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() + * and scx_bpf_dsq_move_set_vtime() to update. * - * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See - * scx_bpf_dispatch_vtime() for more information on @vtime. + * All other aspects are identical to scx_bpf_dsq_move(). See + * scx_bpf_dsq_insert_vtime() for more information on @vtime. */ +__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ __bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, - p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); + return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); } __bpf_kfunc_end_defs(); @@ -6278,7 +6748,12 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) @@ -6379,6 +6854,12 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) @@ -6478,7 +6959,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) goto out; } } else { - dsq = find_non_local_dsq(dsq_id); + dsq = find_user_dsq(dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -6527,7 +7008,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_non_local_dsq(dsq_id); + kit->dsq = find_user_dsq(dsq_id); if (!kit->dsq) return -ENOENT; @@ -7054,15 +7535,8 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) goto out; - /* - * A task_group may either be a cgroup or an autogroup. In the latter - * case, @tg->css.cgroup is %NULL. A task_group can't become the other - * kind once created. - */ - if (tg && tg->css.cgroup) - cgrp = tg->css.cgroup; - else - cgrp = &cgrp_dfl_root.cgrp; + cgrp = tg_cgrp(tg); + out: cgroup_get(cgrp); return cgrp; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 246019519231..b1675bb59fc4 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq); void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); -bool task_should_scx(struct task_struct *p); +bool task_should_scx(int policy); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 225b31aaee55..fbdca89c677f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq, */ s64 update_curr_common(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - delta_exec = update_curr_se(rq, &curr->se); + delta_exec = update_curr_se(rq, &donor->se); if (likely(delta_exec > 0)) - update_curr_task(curr, delta_exec); + update_curr_task(donor, delta_exec); return delta_exec; } @@ -1247,18 +1247,18 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (rq->nr_running == 1) + if (cfs_rq->nr_running == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { - resched_curr(rq); + resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void @@ -3369,7 +3369,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3491,7 +3491,7 @@ retry_pids: */ if (vma_pids_forced) break; - } for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) struct sched_entity *se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - SCHED_WARN_ON(se->sched_delayed); - SCHED_WARN_ON(se->on_rq); + /* + * Must not reference @se again, see __block_task(). + */ return NULL; } return se; @@ -5677,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif } @@ -6058,10 +6053,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) { - SCHED_WARN_ON(se->sched_delayed); + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -6818,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -6833,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -7173,7 +7171,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); - /* Fix-up what block_task() skipped. */ + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ __block_task(rq, p); } @@ -7190,12 +7192,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) util_est_dequeue(&rq->cfs, p); - if (dequeue_entities(rq, &p->se, flags) < 0) { - util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) return false; - } - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); return true; } @@ -8753,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se) */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8784,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; if (!sched_feat(WAKEUP_PREEMPTION)) @@ -8832,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } static struct task_struct *pick_task_fair(struct rq *rq) @@ -13083,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -13174,22 +13178,6 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); - /* - * Since this is called after changing class, this is a little weird - * and we cannot use DEQUEUE_DELAYED. - */ - if (p->se.sched_delayed) { - /* First, dequeue it from its new class' structures */ - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); - /* - * Now, clean up the fair_sched_class side of things - * related to sched_delayed being true and that wasn't done - * due to the generic dequeue not using DEQUEUE_DELAYED. - */ - finish_delayed_dequeue_entity(&p->se); - p->se.rel_deadline = 0; - __block_task(rq, p); - } } static void switched_to_fair(struct rq *rq, struct task_struct *p) @@ -13206,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); else wakeup_preempt(rq, p, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 290874079f60..a3d331dd2d8f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) */ SCHED_FEAT(RUN_TO_PARITY, true) /* - * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for * current. */ SCHED_FEAT(PREEMPT_SHORT, true) @@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) -SCHED_FEAT(DOUBLE_TICK, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d2f096bb274c..621696269584 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -271,7 +271,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); /* * Interrupts shouldn't be re-enabled from that point on until @@ -399,8 +398,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED_HARD); diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a9c65d97b3ca..fc07382361a8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) bool update_other_load_avgs(struct rq *rq) { u64 now = rq_clock_pelt(rq); - const struct sched_class *curr_class = rq->curr->sched_class; + const struct sched_class *curr_class = rq->donor->sched_class; unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); lockdep_assert_rq_held(rq); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 020d58967d4e..84dad1511d1e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -769,12 +769,13 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; + u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); @@ -789,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } @@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now, irq; s64 delta; + u64 irq; if (static_branch_likely(&psi_disabled)) return; @@ -1011,7 +1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st if (prev && task_psi_group(prev) == group) return; - now = cpu_clock(cpu); irq = irq_time_read(cpu); delta = (s64)(irq - rq->psi_irq_time); if (delta < 0) @@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st rq->psi_irq_time = irq; do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 172c588de542..bd66a46b06ac 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; delta_exec = update_curr_common(rq); @@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq) return; #ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity *rt_se = &curr->rt; + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; @@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task); static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; @@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * systems like big.LITTLE. */ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1606,12 +1607,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. - */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) */ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); @@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); @@ -2000,7 +1991,7 @@ retry: * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } @@ -2021,7 +2012,7 @@ retry: * Note that the stoppers are masqueraded as SCHED_FIFO * (cf. sched_set_stop_task()), so we can't rely on rt_task(). */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) return 0; cpu = find_lowest_rq(rq->curr); @@ -2088,9 +2079,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); ret = 1; @@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq) * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); resched = true; } /* @@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then reschedule. */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8063db62b027..76f5f53a645f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -432,16 +432,17 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_GROUP_SCHED_WEIGHT + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; - - /* A positive value indicates that this is a SCHED_IDLE group. */ - int idle; - #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -582,6 +583,7 @@ static inline void set_task_rq_fair(struct sched_entity *se, #endif /* CONFIG_SMP */ #else /* !CONFIG_FAIR_GROUP_SCHED */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } +static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -749,8 +751,9 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 3, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -1145,7 +1148,10 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1342,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -2083,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) #endif /* CONFIG_SMP */ -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ - -static inline void sched_core_account_forceidle(struct rq *rq) { } - -static inline void sched_core_tick(struct rq *rq) { } - -#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2258,11 +2241,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2290,6 +2287,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2332,6 +2330,7 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ @@ -2358,6 +2357,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) @@ -2446,7 +2446,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); prev->sched_class->put_prev_task(rq, prev, NULL); } @@ -2610,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node) static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2690,6 +2690,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); @@ -2764,8 +2765,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) static inline void __block_task(struct rq *rq, struct task_struct *p) { - WRITE_ONCE(p->on_rq, 0); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -2773,6 +2772,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p) atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); } extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3164,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { @@ -3594,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid = __this_cpu_read(pcpu_cid->recent_cid); - cpumask = mm_cidmask(mm); + /* Try to re-use recent cid. This improves cache locality. */ + if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = atomic_read(&mm->max_nr_cid); + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; return cid; @@ -3629,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3639,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3665,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3682,7 +3759,8 @@ end: return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3699,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); return cid; } @@ -3753,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } #else /* !CONFIG_SCHED_MM_CID: */ @@ -3766,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif #ifdef CONFIG_RT_MUTEXES @@ -3795,7 +3896,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); -extern void __setscheduler_prio(struct task_struct *p, int prio); +extern const struct sched_class *__setscheduler_class(int policy, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 237780aa3c53..8ee0add5a48a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -119,44 +119,71 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, int flags) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (flags & ENQUEUE_MIGRATED) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) + if (flags & DEQUEUE_SLEEP) return; + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ psi_task_change(p, p->psi_flags, 0); } @@ -190,8 +217,8 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index aa70beee9895..0d71fcbaf1e3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) @@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -706,18 +706,23 @@ change: queue_flags &= ~DEQUEUE_MOVE; } + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); + p->sched_class = next_class; + p->prio = newprio; } __setscheduler_uclamp(p, attr); check_class_changing(rq, p, prev_class); @@ -1076,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - /** * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. @@ -1159,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, #endif } - return sched_attr_copy_to_user(uattr, &kattr, usize); + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } #ifdef CONFIG_SMP diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 134d7112ef71..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -9,7 +9,7 @@ static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hash-table's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address and bit will be woken, and only if the + * bit is clear. + * + * In order for this to function properly there must be a full memory + * barrier after the bit is cleared and before this function is called. + * If the bit was cleared atomically, such as a by clear_bit() then + * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed. + * If the bit was cleared with a fully-ordered operation, no further + * barrier is required. + * + * Normally the bit should be cleared by an operation with RELEASE + * semantics so that any changes to memory made before the bit is + * cleared are guaranteed to be visible after the matching wait_on_bit() + * completes. */ -void wake_up_bit(void *word, int bit) +void wake_up_bit(unsigned long *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } @@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int } EXPORT_SYMBOL(init_wait_var_entry); +/** + * wake_up_var - wake up waiters on a variable (kernel address) + * @var: the address of the variable being waited on + * + * Wake up any process waiting in wait_var_event() or similar for the + * given variable to change. wait_var_event() can be waiting for an + * arbitrary condition to be true and associates that condition with an + * address. Calling wake_up_var() suggests that the condition has been + * made true, but does not strictly require the condtion to use the + * address given. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address will be woken. + * + * In order for this to function properly there must be a full memory + * barrier after the variable is updated (or more accurately, after the + * condition waited on has been made to be true) and before this function + * is called. If the variable was updated atomically, such as a by + * atomic_dec() then smb_mb__after_atomic() can be used. If the + * variable was updated by a fully ordered operation such as + * atomic_dec_and_test() then no extra barrier is required. Otherwise + * smb_mb() is needed. + * + * Normally the variable should be updated (the condition should be made + * to be true) by an operation with RELEASE semantics such as + * smp_store_release() so that any changes to memory made before the + * variable was updated are guaranteed to be visible after the matching + * wait_var_event() completes. + */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; |