summary | refs | log | tree | commit | diff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- kernel/sched/fair.c 227
1 files changed, 111 insertions, 116 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 914096c5b1ae..5ffec4370602 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -799,8 +799,6 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static void attach_entity_cfs_rq(struct sched_entity *se);
-
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -835,20 +833,6 @@ void post_init_entity_util_avg(struct task_struct *p)
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
- }
-
- sa->runnable_avg = sa->util_avg;
-
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -864,7 +848,19 @@ void post_init_entity_util_avg(struct task_struct *p)
return;
}
- attach_entity_cfs_rq(se);
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ sa->runnable_avg = sa->util_avg;
}
#else /* !CONFIG_SMP */
@@ -1592,11 +1588,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
-static inline bool test_idle_cores(int cpu, bool def);
+static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
- idle_core >= 0 || !test_idle_cores(cpu, false))
+ idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
/*
@@ -2600,7 +2596,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -3838,8 +3834,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {}
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- * avg. The immediate corollary is that all (fair) tasks must be attached, see
- * post_init_entity_util_avg().
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
@@ -4003,6 +3998,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -4032,6 +4028,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
@@ -4064,8 +4067,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
- * post_init_entity_util_avg() which will have added things to the
- * cfs_rq, so we can remove unconditionally.
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
sync_entity_load_avg(se);
@@ -4262,7 +4265,7 @@ static inline int task_fits_capacity(struct task_struct *p,
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return;
if (!p || p->nr_cpus_allowed == 1) {
@@ -4292,6 +4295,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define DO_DETACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -4434,7 +4438,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -4511,6 +4516,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ int action = UPDATE_TG;
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -4519,12 +4529,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_running of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
update_stats_dequeue_fair(cfs_rq, se, flags);
@@ -5893,8 +5904,8 @@ dequeue_throttle:
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -6260,7 +6271,7 @@ static inline void set_idle_cores(int cpu, int val)
WRITE_ONCE(sds->has_idle_cores, val);
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
@@ -6268,7 +6279,7 @@ static inline bool test_idle_cores(int cpu, bool def)
if (sds)
return READ_ONCE(sds->has_idle_cores);
- return def;
+ return false;
}
/*
@@ -6284,7 +6295,7 @@ void __update_idle_core(struct rq *rq)
int cpu;
rcu_read_lock();
- if (test_idle_cores(core, true))
+ if (test_idle_cores(core))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -6310,9 +6321,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
bool idle = true;
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return __select_idle_cpu(core, p);
-
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
idle = false;
@@ -6339,13 +6347,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
- !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6360,9 +6367,9 @@ static inline void set_idle_cores(int cpu, int val)
{
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
- return def;
+ return false;
}
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
@@ -6370,7 +6377,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core, p);
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6389,19 +6396,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct sched_domain_shared *sd_share;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
- struct sched_domain *this_sd;
+ struct sched_domain *this_sd = NULL;
u64 time = 0;
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
unsigned long now = jiffies;
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
/*
* If we're busy, the assumption that the last idle period
* predicts the future is flawed; age away the remaining
@@ -6455,7 +6462,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (has_idle_core)
set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && !has_idle_core) {
+ if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
time = cpu_clock(this) - time;
/*
@@ -6506,7 +6513,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
- if (static_branch_unlikely(&sched_asym_cpucapacity))
+ if (sched_asym_cpucap_active())
return fits_capacity(task_util, capacity_of(cpu));
return true;
@@ -6526,7 +6533,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* On asymmetric system, update task utilization because we will check
* that the task fits with cpu's capacity.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
task_util = uclamp_task_util(p);
}
@@ -6580,7 +6587,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
@@ -6601,10 +6608,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
if (sched_smt_active()) {
- has_idle_core = test_idle_cores(target, false);
+ has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
- i = select_idle_smt(p, sd, prev);
+ i = select_idle_smt(p, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
@@ -7076,8 +7083,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
return new_cpu;
}
-static void detach_entity_cfs_rq(struct sched_entity *se);
-
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7099,15 +7104,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
}
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
- /*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(se);
-
- } else {
+ if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
/*
@@ -7279,7 +7276,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- BUG_ON(!pse);
+ WARN_ON_ONCE(!pse);
cse_is_idle = se_is_idle(se);
pse_is_idle = se_is_idle(pse);
@@ -7938,7 +7935,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -8012,8 +8009,6 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
@@ -8049,20 +8044,24 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
env->loop++;
- /* We've more or less seen every task there is, call it quits */
- if (env->loop > env->loop_max)
+ /*
+ * We've more or less seen every task there is, call it quits
+ * unless we haven't found any movable task yet.
+ */
+ if (env->loop > env->loop_max &&
+ !(env->flags & LBF_ALL_PINNED))
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
@@ -8159,7 +8158,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
check_preempt_curr(rq, p, 0);
}
@@ -10099,14 +10098,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
@@ -10134,7 +10132,7 @@ redo:
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
@@ -10182,7 +10180,9 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
+ /* Stop if we tried all running tasks */
+ if (env.loop < busiest->nr_running)
+ goto more_balance;
}
/*
@@ -10213,7 +10213,7 @@ more_balance:
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -10245,7 +10245,7 @@ more_balance:
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -10430,7 +10430,7 @@ static int active_load_balance_cpu_stop(void *data)
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
@@ -10916,8 +10916,7 @@ static bool update_nohz_stats(struct rq *rq)
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
-static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -11032,7 +11031,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (idle != CPU_IDLE)
return false;
- _nohz_idle_balance(this_rq, flags, idle);
+ _nohz_idle_balance(this_rq, flags);
return true;
}
@@ -11052,7 +11051,7 @@ void nohz_run_idle_balance(int cpu)
* (ie NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
- _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
}
static void nohz_newidle_balance(struct rq *this_rq)
@@ -11552,6 +11551,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+#ifdef CONFIG_SMP
+ /*
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+#endif
+
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
@@ -11563,14 +11573,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
@@ -11666,39 +11668,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_set_group_fair(struct task_struct *p)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
-
- set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-}
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
-static void task_move_group_fair(struct task_struct *p)
-{
detach_task_cfs_rq(p);
- set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
#endif
+ set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
-static void task_change_group_fair(struct task_struct *p, int type)
-{
- switch (type) {
- case TASK_SET_GROUP:
- task_set_group_fair(p);
- break;
-
- case TASK_MOVE_GROUP:
- task_move_group_fair(p);
- break;
- }
-}
-
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -12075,6 +12063,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
+ int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ }
+
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON