summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c1515
1 files changed, 970 insertions, 545 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..4037e19bbca2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
@@ -32,6 +33,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -513,6 +515,7 @@ static inline int entity_before(struct sched_entity *a,
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -523,10 +526,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
if (!curr)
vruntime = se->vruntime;
@@ -547,10 +549,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
- int leftmost = 1;
+ bool leftmost = true;
/*
* Find the right place in the rbtree:
@@ -566,36 +568,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
link = &parent->rb_left;
} else {
link = &parent->rb_right;
- leftmost = 0;
+ leftmost = false;
}
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
-
rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
- }
-
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
@@ -616,7 +605,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
@@ -729,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- sa->last_update_time = 0;
- /*
- * sched_avg's period_contrib should be strictly less then 1024, so
- * we give it 1023 to make sure it is almost a period (1024us), and
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
+ memset(sa, 0, sizeof(*sa));
+
/*
* Tasks are intialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
@@ -743,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- /*
- * At this point, util_avg won't be used in select_task_rq_fair anyway
- */
- sa->util_avg = 0;
- sa->util_sum = 0;
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+ se->runnable_weight = se->load.weight;
+
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -797,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
if (entity_is_task(se)) {
@@ -806,7 +786,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -864,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
+ cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
@@ -1071,6 +1051,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1110,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1166,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1215,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1419,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1450,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1849,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1933,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1963,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -1966,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.load_sum / p->se.load.weight;
+ delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
@@ -2448,7 +2500,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2628,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2638,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2651,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2694,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight += se->runnable_weight;
+
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight -= se->runnable_weight;
+
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
+ se_runnable(se) * se->avg.runnable_load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight, unsigned long runnable)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ dequeue_runnable_load_avg(cfs_rq, se);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+ se->runnable_weight = runnable;
+ update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+ do {
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+ } while (0);
+#endif
+
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ account_entity_enqueue(cfs_rq, se);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ }
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &se->load;
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+ reweight_entity(cfs_rq, se, weight, weight);
+ load->inv_weight = sched_prio_to_wmult[prio];
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (1)
+ * \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ * grq->load.weight -> grq->avg.load_avg (2)
+ *
+ * which yields the following:
+ *
+ * tg->weight * grq->avg.load_avg
+ * ge->load.weight = ------------------------------ (3)
+ * tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- = tg->weight (4)
+ * grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ * ge->load.weight =
+ *
+ * tg->weight * grq->load.weight
+ * --------------------------------------------------- (5)
+ * tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (6)
+ * tg_load_avg'
+ *
+ * Where:
+ *
+ * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ * max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ * \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
{
- long tg_weight, load, shares;
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;
- /*
- * This really should be: cfs_rq->avg.load_avg, but instead we use
- * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- * the shares for small weight interactive tasks.
- */
- load = scale_load_down(cfs_rq->load.weight);
+ tg_shares = READ_ONCE(tg->shares);
+
+ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
tg_weight = atomic_long_read(&tg->load_avg);
@@ -2713,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
- shares = (tg->shares * load);
+ shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
@@ -2729,67 +2928,115 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- return tg->shares;
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-# endif /* CONFIG_SMP */
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
- unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ * grq->avg.runnable_load_avg
+ * ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ * grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ * ge->runnable_weight = ge->load.weight *
+ *
+ * max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ * ----------------------------------------------------- (8)
+ * max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
- if (se->on_rq) {
- /* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
- }
+ long runnable, load_avg;
- update_load_set(&se->load, weight);
+ load_avg = max(cfs_rq->avg.load_avg,
+ scale_load_down(cfs_rq->load.weight));
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ runnable = max(cfs_rq->avg.runnable_load_avg,
+ scale_load_down(cfs_rq->runnable_weight));
+
+ runnable *= shares;
+ if (load_avg)
+ runnable /= load_avg;
+
+ return clamp_t(long, runnable, MIN_SHARES, shares);
}
+# endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
- long shares;
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares, runnable;
- if (!cfs_rq)
+ if (!gcfs_rq)
return;
- if (throttled_hierarchy(cfs_rq))
+ if (throttled_hierarchy(gcfs_rq))
return;
- tg = cfs_rq->tg;
-
#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+
+ if (likely(se->load.weight == shares))
return;
+#else
+ shares = calc_group_shares(gcfs_rq);
+ runnable = calc_group_runnable(gcfs_rq, shares);
#endif
- shares = calc_cfs_shares(cfs_rq, tg);
- reweight_entity(cfs_rq_of(se), se, shares);
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2869,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long load, unsigned long runnable, int running)
{
unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2886,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods);
- }
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
/*
@@ -2902,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
sa->period_contrib = delta;
contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
if (running)
sa->util_sum += contrib * scale_cpu;
@@ -2942,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -2968,69 +3212,114 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!load)
+ runnable = running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
return 0;
+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
/*
* Step 2: update *_avg.
*/
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
- }
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
-
- return 1;
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ sa->util_avg = sa->util_sum / divider;
}
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfq_rs:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runable_load_avg
+ */
+
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
}
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- cfs_rq->curr != NULL, cfs_rq);
-}
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- \
- res = var + val; \
- \
- if (val < 0 && res > var) \
- res = 0; \
- \
- WRITE_ONCE(*ptr, res); \
-} while (0)
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
@@ -3113,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ * grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ * d(runnable_avg) += se->avg.load_avg / se->load.weight
+ * (5)
+ * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/* Nothing to update */
@@ -3133,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}
-/* Take into account change of load of a child task group */
static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
+ long runnable_load_avg, load_avg;
+ s64 runnable_load_sum, load_sum;
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
+ if (!runnable_sum)
+ return;
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+ gcfs_rq->prop_runnable_sum = 0;
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
-
- delta = load - se->avg.load_avg;
+ add_positive(&se->avg.load_sum, runnable_sum);
+ add_positive(&se->avg.load_avg, load_avg);
- /* Nothing to update */
- if (!delta)
- return;
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
- /* Set new sched_entity's load */
- se->avg.load_avg = load;
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
- /* Update parent cfs_rq load */
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
}
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
- if (!cfs_rq->propagate_avg)
- return 0;
-
- cfs_rq->propagate_avg = 0;
- return 1;
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
- if (!test_and_clear_tg_cfs_propagate(se))
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
return 0;
+ gcfs_rq->propagate = 0;
+
cfs_rq = cfs_rq_of(se);
- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
- update_tg_cfs_util(cfs_rq, se);
- update_tg_cfs_load(cfs_rq, se);
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
return 1;
}
@@ -3252,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
- if (gcfs_rq->propagate_avg)
+ if (gcfs_rq->propagate)
return false;
/*
@@ -3272,55 +3590,14 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 0;
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,67 +3611,47 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed_load = 0, removed_util = 0;
+ int decayed = 0;
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ if (cfs_rq->removed.nr) {
+ unsigned long r;
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ raw_spin_lock(&cfs_rq->removed.lock);
+ swap(cfs_rq->removed.util_avg, removed_util);
+ swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ cfs_rq->removed.nr = 0;
+ raw_spin_unlock(&cfs_rq->removed.lock);
+
+ r = removed_load;
sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed_load = 1;
- set_tg_cfs_propagate(cfs_rq);
- }
+ sub_positive(&sa->load_sum, r * divider);
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ r = removed_util;
sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- removed_util = 1;
- set_tg_cfs_propagate(cfs_rq);
+ sub_positive(&sa->util_sum, r * divider);
+
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+
+ decayed = 1;
}
- decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed)
cfs_rq_util_change(cfs_rq);
- return decayed || removed_load;
-}
-
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG 0x1
-#define SKIP_AGE_LOAD 0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
- int decayed;
-
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
- decayed |= propagate_entity_load_avg(se);
-
- if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ return decayed;
}
/**
@@ -3407,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+ /*
+ * When we attach the @se to the @cfs_rq, we must align the decay
+ * window because without that, really weird and wonderful things can
+ * happen.
+ *
+ * XXX illustrate
+ */
se->avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+ /*
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ * period_contrib. This isn't strictly correct, but since we're
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ * _sum a little.
+ */
+ se->avg.util_sum = se->avg.util_avg * divider;
+
+ se->avg.load_sum = divider;
+ if (se_weight(se)) {
+ se->avg.load_sum =
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+ }
+
+ se->avg.runnable_load_sum = se->avg.load_sum;
+
+ enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3427,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+ dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+#define DO_ATTACH 0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+ int decayed;
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, 0);
- }
-}
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+ } else if (decayed && (flags & UPDATE_TG))
+ update_tg_load_avg(cfs_rq, 0);
}
#ifndef CONFIG_64BIT
@@ -3503,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ unsigned long flags;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3515,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
*/
sync_entity_load_avg(se);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ ++cfs_rq->removed.nr;
+ cfs_rq->removed.util_avg += se->avg.util_avg;
+ cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
- return cfs_rq->runnable_load_avg;
+ return cfs_rq->avg.runnable_load_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3534,23 +3832,20 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
+#define DO_ATTACH 0x0
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq);
}
-static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
@@ -3695,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
- update_load_avg(se, UPDATE_TG);
- enqueue_entity_load_avg(cfs_rq, se);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ update_cfs_group(se);
+ enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
@@ -3779,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(se, UPDATE_TG);
- dequeue_entity_load_avg(cfs_rq, se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ dequeue_runnable_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3803,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(se);
+ update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3867,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3969,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_load_avg(prev, 0);
+ update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3985,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(curr);
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4875,7 +5170,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* passed.
*/
if (p->in_iowait)
- cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
@@ -4903,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -4962,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
}
if (!se)
@@ -5125,9 +5420,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5467,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5488,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5504,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5530,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5548,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5563,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5585,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5640,77 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
+
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ if (idle_cpu(this_cpu))
+ return true;
+
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
+
+ return false;
+}
+
+static bool
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (current_load > this_eff_load)
+ return true;
+
+ this_eff_load -= current_load;
+ }
+
+ task_load = task_h_load(p);
+
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
bool affine = false;
- /*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
- */
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+
+ if (sched_feat(WA_WEIGHT) && !affine)
+ affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -5380,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5387,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@ -5509,10 +5864,10 @@ skip_spare:
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5550,7 +5905,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -5561,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5913,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
- pick_cpu:
+pick_cpu:
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
}
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -5990,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
se->vruntime -= min_vruntime;
}
- /*
- * We are supposed to update the task to "current" time, then its up to date
- * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
- * what current time is, so simply throw away the out-of-date time. This
- * will result in the wakee task is less decayed, but giving the wakee more
- * load sounds not bad.
- */
- remove_entity_load_avg(&p->se);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+ /*
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+ * rq->lock and can modify state directly.
+ */
+ lockdep_assert_held(&task_rq(p)->lock);
+ detach_entity_cfs_rq(&p->se);
+
+ } else {
+ /*
+ * We are supposed to update the task to "current" time, then
+ * its up to date and ready to go to new CPU/cfs_rq. But we
+ * have difficulty in getting what current time is, so simply
+ * throw away the out-of-date time. This will result in the
+ * wakee task is less decayed, but giving the wakee more load
+ * sounds not bad.
+ */
+ remove_entity_load_avg(&p->se);
+ }
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
@@ -6187,10 +6580,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6613,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6259,17 +6658,10 @@ again:
set_next_entity(cfs_rq, se);
}
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
-
- return p;
+ goto done;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6280,6 +6672,16 @@ simple:
p = task_of(se);
+done: __maybe_unused
+#ifdef CONFIG_SMP
+ /*
+ * Move the next running task to the front of
+ * the list, so our cfs_tasks list becomes MRU
+ * one.
+ */
+ list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
@@ -6715,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
lockdep_assert_held(&env->src_rq->lock);
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ list_for_each_entry_reverse(p,
+ &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
@@ -6765,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
- p = list_first_entry(tasks, struct task_struct, se.group_node);
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
@@ -6815,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
- list_move_tail(&p->se.group_node, tasks);
+ list_move(&p->se.group_node, tasks);
}
/*
@@ -6891,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;
- if (cfs_rq->runnable_load_sum)
+ if (cfs_rq->avg.runnable_load_sum)
return false;
return true;
@@ -6917,13 +7320,13 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq_of(se), se, 0);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -6990,7 +7393,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7439,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7459,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7768,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7546,6 +7951,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7560,7 +7966,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
}
/**
@@ -7581,7 +7986,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
@@ -7790,6 +8195,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7801,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
@@ -7892,7 +8301,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
@@ -7970,6 +8379,13 @@ static int should_we_balance(struct lb_env *env)
int cpu, balance_cpu = -1;
/*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
@@ -8309,6 +8725,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
this_rq->idle_stamp = rq_clock(this_rq);
/*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
@@ -8415,6 +8837,13 @@ static int active_load_balance_cpu_stop(void *data)
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8599,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!is_housekeeping_cpu(cpu))
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@ -9064,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
}
}
#else
@@ -9076,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_load_avg(se, 0);
+ update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9095,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
#endif
/* Synchronize entity with its cfs_rq */
- update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
propagate_entity_cfs_rq(se);
@@ -9171,17 +9600,13 @@ static void set_curr_task_fair(struct rq *rq)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->propagate_avg = 0;
-#endif
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
+ raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
@@ -9379,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
- update_load_avg(se, UPDATE_TG);
- update_cfs_shares(se);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}