summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c917
1 files changed, 603 insertions, 314 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
#include "sched.h"
#include "../workqueue_sched.h"
+#include "../smpboot.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
#include "features.h"
- NULL
};
#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-void update_cpu_load(struct rq *this_rq);
-
static void set_load_weight(struct task_struct *p)
{
int prio = p->static_prio - MAX_RT_PRIO;
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
-void calc_load_account_idle(struct rq *this_rq)
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
/*
- * Its got a race, we don't care...
+ * If we're still before the sample window, we're done.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
@@ -2486,22 +2615,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
* every tick. We fix it up based on jiffies.
*/
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
{
- unsigned long this_load = this_rq->load.weight;
- unsigned long curr_jiffies = jiffies;
- unsigned long pending_updates;
int i, scale;
this_rq->nr_load_updates++;
- /* Avoid repeated calls on same jiffy, when moving in and out of idle */
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
/* Update our load: */
this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2646,78 @@ void update_cpu_load(struct rq *this_rq)
sched_avg_update(this_rq);
}
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long load = this_rq->load.weight;
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
static void update_cpu_load_active(struct rq *this_rq)
{
- update_cpu_load(this_rq);
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, this_rq->load.weight, 1);
calc_load_account_active(this_rq);
}
@@ -3113,6 +3302,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
+ add_taint(TAINT_WARN);
}
/*
@@ -4042,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- if (cred->user->user_ns == pcred->user->user_ns)
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
- else
- match = false;
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
rcu_read_unlock();
return match;
}
@@ -4957,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
}
/*
@@ -5499,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
{
- sched_domain_debug_enabled = 1;
+ sched_debug_enabled = 1;
return 0;
}
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
@@ -5547,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->sgp->power) {
+ /*
+ * Even though we initialize ->power to something semi-sane,
+ * we leave power_orig unset. This allows us to detect if
+ * domain iteration is still funny without causing /0 traps.
+ */
+ if (!group->sgp->power_orig) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5560,7 +5757,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
@@ -5594,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_domain_debug_enabled)
+ if (!sched_debug_enabled)
return;
if (!sd) {
@@ -5615,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
@@ -5898,99 +6100,11 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
- int i, n, val, min_val, best_node = -1;
-
- min_val = INT_MAX;
-
- for (i = 0; i < nr_node_ids; i++) {
- /* Start at @node */
- n = (node + i) % nr_node_ids;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (node_isset(n, *used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- if (best_node != -1)
- node_set(best_node, *used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
- nodemask_t used_nodes;
- int i;
-
- cpumask_clear(span);
- nodes_clear(used_nodes);
-
- cpumask_or(span, span, cpumask_of_node(node));
- node_set(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, &used_nodes);
- if (next_node < 0)
- break;
- cpumask_or(span, span, cpumask_of_node(next_node));
- }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
- lockdep_assert_held(&sched_domains_mutex);
-
- sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
- return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
- return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
@@ -6020,9 +6134,48 @@ struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
int flags;
+ int numa_level;
struct sd_data data;
};
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -6041,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -6048,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -6058,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
- atomic_inc(&sg->sgp->ref);
+ sg->sgp = *per_cpu_ptr(sdd->sgp, i);
+ if (atomic_inc_return(&sg->sgp->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Initialize sgp->power such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
- if (cpumask_test_cpu(cpu, sg_span))
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
groups = sg;
if (!first)
@@ -6135,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6176,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
@@ -6211,10 +6383,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
}
SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
#ifdef CONFIG_SCHED_SMT
SD_INIT_FUNC(SIBLING)
#endif
@@ -6230,11 +6398,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6336,15 +6501,236 @@ static struct sched_domain_topology_level default_topology[] = {
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
- { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+ if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
+ return 0;
+
+ return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int level = tl->numa_level;
+ int sd_weight = cpumask_weight(
+ sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+ .cache_nice_tries = 2,
+ .busy_idx = 3,
+ .idle_idx = 2,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 0*SD_BALANCE_EXEC
+ | 0*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 0*SD_WAKE_AFFINE
+ | 0*SD_PREFER_LOCAL
+ | 0*SD_SHARE_CPUPOWER
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 1*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | sd_local_flags(level)
+ ,
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ };
+ SD_INIT_NAME(sd, NUMA);
+ sd->private = &tl->data;
+
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+
+ return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_nume_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; default_topology[i].init; i++)
+ tl[i] = default_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .init = sd_numa_init,
+ .mask = sd_numa_mask,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ };
+ }
+
+ sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
static int __sdt_alloc(const struct cpumask *cpu_map)
{
struct sched_domain_topology_level *tl;
@@ -6386,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
return -ENOMEM;
@@ -6439,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -6447,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6587,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- dattr_cur = NULL;
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@@ -6712,97 +7097,6 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
- get_online_cpus();
-
- /* Destroy domains first to force the rebuild */
- partition_sched_domains(0, NULL, NULL);
-
- rebuild_sched_domains();
- put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
- unsigned int level = 0;
-
- if (sscanf(buf, "%u", &level) != 1)
- return -EINVAL;
-
- /*
- * level is always be positive so don't check for
- * level < POWERSAVINGS_BALANCE_NONE which is 0
- * What happens on 0 or 1 byte write,
- * need to check for count as well?
- */
-
- if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
- return -EINVAL;
-
- if (smt)
- sched_smt_power_savings = level;
- else
- sched_mc_power_savings = level;
-
- reinit_sched_domains();
-
- return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 0);
-}
-static DEVICE_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- return sched_power_savings_store(buf, count, 1);
-}
-static DEVICE_ATTR(sched_smt_power_savings, 0644,
- sched_smt_power_savings_show,
- sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct device *dev)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
-#endif
- return err;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6840,6 +7134,8 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ sched_init_numa();
+
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
@@ -7061,6 +7357,7 @@ void __init sched_init(void)
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
#endif
init_sched_fair_class();
@@ -7982,13 +8279,9 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+ { } /* terminate */
};
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
@@ -7996,8 +8289,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
+ .base_cftypes = cpu_files,
.early_init = 1,
};
@@ -8182,13 +8475,9 @@ static struct cftype files[] = {
.name = "stat",
.read_map = cpuacct_stats_show,
},
+ { } /* terminate */
};
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
/*
* charge this task's execution time to its accounting group.
*
@@ -8220,7 +8509,7 @@ struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,
.destroy = cpuacct_destroy,
- .populate = cpuacct_populate,
.subsys_id = cpuacct_subsys_id,
+ .base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */