summaryrefslogtreecommitdiff
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c412
1 files changed, 270 insertions, 142 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index bfb8ad8ed171..13d9a7b931e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
+#include <linux/ftrace.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include "sched_cpupri.h"
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -151,7 +154,8 @@ static inline int task_has_rt_policy(struct task_struct *p)
*/
struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_RT_PRIO];
+ struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
+ struct list_head squeue[MAX_RT_PRIO]; /* shared queue */
};
struct rt_bandwidth {
@@ -289,15 +293,15 @@ struct task_group root_task_group;
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
-#else
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
#define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -307,17 +311,20 @@ static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
/*
- * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
#define MIN_SHARES 2
-#define MAX_SHARES (ULONG_MAX - 1)
+#define MAX_SHARES (1UL << 18)
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
@@ -449,6 +456,9 @@ struct root_domain {
*/
cpumask_t rto_mask;
atomic_t rto_count;
+#ifdef CONFIG_SMP
+ struct cpupri cpupri;
+#endif
};
/*
@@ -523,6 +533,7 @@ struct rq {
int push_cpu;
/* cpu of this runqueue: */
int cpu;
+ int online;
struct task_struct *migration_thread;
struct list_head migration_queue;
@@ -595,6 +606,8 @@ static inline void update_rq_clock(struct rq *rq)
rq->clock = sched_clock_cpu(cpu_of(rq));
}
+#include "sched_trace.h"
+
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -604,6 +617,24 @@ static inline void update_rq_clock(struct rq *rq)
# define const_debug static const
#endif
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+ int cpu = get_cpu();
+ struct rq *rq = cpu_rq(cpu);
+ int ret;
+
+ ret = spin_is_locked(&rq->lock);
+ put_cpu();
+ return ret;
+}
+
/*
* Debugging: various feature bits
*/
@@ -828,7 +859,7 @@ static unsigned long long __cpu_clock(int cpu)
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
*/
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
{
unsigned long long prev_cpu_time, time, delta_time;
unsigned long flags;
@@ -1124,6 +1155,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+#ifdef CONFIG_SMP
static void hotplug_hrtick_disable(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1179,6 +1211,7 @@ static void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
+#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
{
@@ -1308,15 +1341,15 @@ void wake_up_idle_cpu(int cpu)
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
-#endif
+#endif /* CONFIG_NO_HZ */
-#else
+#else /* !CONFIG_SMP */
static void __resched_task(struct task_struct *p, int tif_bit)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_thread_flag(p, tif_bit);
}
-#endif
+#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
@@ -1337,8 +1370,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
- if (!lw->inv_weight)
- lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+ if (!lw->inv_weight) {
+ if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else
+ lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+ / (lw->weight+1);
+ }
tmp = (u64)delta_exec * weight;
/*
@@ -1471,16 +1509,8 @@ static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-}
#endif
-#endif /* CONFIG_SMP */
-
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1490,6 +1520,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#endif
#define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
static inline void inc_load(struct rq *rq, const struct task_struct *p)
{
@@ -1626,12 +1658,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->load.weight;
-}
-
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
@@ -1660,6 +1686,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
/*
* Is this task likely cache-hot:
*/
@@ -1796,6 +1828,7 @@ void wait_task_inactive(struct task_struct *p)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
+ trace_kernel_sched_wait(p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
task_rq_unlock(rq, &flags);
@@ -1929,7 +1962,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/* Tally up the load of all CPUs in the group */
avg_load = 0;
- for_each_cpu_mask(i, group->cpumask) {
+ for_each_cpu_mask_nr(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
@@ -1971,7 +2004,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
/* Traverse only the allowed CPUs */
cpus_and(*tmp, group->cpumask, p->cpus_allowed);
- for_each_cpu_mask(i, *tmp) {
+ for_each_cpu_mask_nr(i, *tmp) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2121,7 +2154,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
}
}
}
-#endif
+#endif /* CONFIG_SCHEDSTATS */
out_activate:
#endif /* CONFIG_SMP */
@@ -2139,6 +2172,7 @@ out_activate:
success = 1;
out_running:
+ trace_kernel_sched_wakeup(rq, p);
check_preempt_curr(rq, p);
p->state = TASK_RUNNING;
@@ -2269,6 +2303,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
p->sched_class->task_new(rq, p);
inc_nr_running(p, rq);
}
+ trace_kernel_sched_wakeup_new(rq, p);
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -2321,7 +2356,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -2333,7 +2368,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
/**
* prepare_task_switch - prepare to switch tasks
@@ -2441,6 +2476,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
+
+ trace_kernel_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -2673,6 +2710,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
|| unlikely(cpu_is_offline(dest_cpu)))
goto out;
+ trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
@@ -2954,7 +2992,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
max_cpu_load = 0;
min_cpu_load = ~0UL;
- for_each_cpu_mask(i, group->cpumask) {
+ for_each_cpu_mask_nr(i, group->cpumask) {
struct rq *rq;
if (!cpu_isset(i, *cpus))
@@ -3218,7 +3256,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long max_load = 0;
int i;
- for_each_cpu_mask(i, group->cpumask) {
+ for_each_cpu_mask_nr(i, group->cpumask) {
unsigned long wl;
if (!cpu_isset(i, *cpus))
@@ -3662,6 +3700,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ int need_serialize;
cpumask_t tmp;
for_each_domain(cpu, sd) {
@@ -3679,8 +3718,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (interval > HZ*NR_CPUS/10)
interval = HZ*NR_CPUS/10;
+ need_serialize = sd->flags & SD_SERIALIZE;
- if (sd->flags & SD_SERIALIZE) {
+ if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
@@ -3696,7 +3736,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
sd->last_balance = jiffies;
}
- if (sd->flags & SD_SERIALIZE)
+ if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3749,7 +3789,7 @@ static void run_rebalance_domains(struct softirq_action *h)
int balance_cpu;
cpu_clear(this_cpu, cpus);
- for_each_cpu_mask(balance_cpu, cpus) {
+ for_each_cpu_mask_nr(balance_cpu, cpus) {
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
@@ -4011,26 +4051,44 @@ void scheduler_tick(void)
#endif
}
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+ if (in_lock_functions(addr)) {
+ addr = CALLER_ADDR2;
+ if (in_lock_functions(addr))
+ addr = CALLER_ADDR3;
+ }
+ return addr;
+}
void __kprobes add_preempt_count(int val)
{
+#ifdef CONFIG_DEBUG_PREEMPT
/*
* Underflow?
*/
if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
return;
+#endif
preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
/*
* Spinlock count overflowing soon?
*/
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
+#endif
+ if (preempt_count() == val)
+ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
void __kprobes sub_preempt_count(int val)
{
+#ifdef CONFIG_DEBUG_PREEMPT
/*
* Underflow?
*/
@@ -4042,7 +4100,10 @@ void __kprobes sub_preempt_count(int val)
if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
!(preempt_count() & PREEMPT_MASK)))
return;
+#endif
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
@@ -4060,6 +4121,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
prev->comm, prev->pid, preempt_count());
debug_show_held_locks(prev);
+ print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
@@ -4133,7 +4195,7 @@ asmlinkage void __sched schedule(void)
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
- int cpu;
+ int cpu, hrtick = sched_feat(HRTICK);
need_resched:
preempt_disable();
@@ -4148,7 +4210,8 @@ need_resched_nonpreemptible:
schedule_debug(prev);
- hrtick_clear(rq);
+ if (hrtick)
+ hrtick_clear(rq);
/*
* Do the rq-clock update outside the rq lock:
@@ -4159,12 +4222,10 @@ need_resched_nonpreemptible:
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
- signal_pending(prev))) {
+ if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
- } else {
+ else
deactivate_task(rq, prev, 1);
- }
switch_count = &prev->nvcsw;
}
@@ -4196,7 +4257,8 @@ need_resched_nonpreemptible:
} else
spin_unlock_irq(&rq->lock);
- hrtick_set(rq);
+ if (hrtick)
+ hrtick_set(rq);
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
@@ -5064,24 +5126,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
return sched_setaffinity(pid, &new_mask);
}
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
long sched_getaffinity(pid_t pid, cpumask_t *mask)
{
struct task_struct *p;
@@ -5378,7 +5422,7 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
void sched_show_task(struct task_struct *p)
{
@@ -5400,12 +5444,7 @@ void sched_show_task(struct task_struct *p)
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- {
- unsigned long *n = end_of_stack(p);
- while (!*n)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(p);
- }
+ free = stack_not_used(p);
#endif
printk(KERN_CONT "%5lu %5d %6d\n", free,
task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -5565,6 +5604,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
goto out;
}
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+ !cpus_equal(p->cpus_allowed, *new_mask))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
else {
@@ -6052,6 +6097,36 @@ static void unregister_sched_domain_sysctl(void)
}
#endif
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ const struct sched_class *class;
+
+ cpu_set(rq->cpu, rq->rd->online);
+ rq->online = 1;
+
+ for_each_class(class) {
+ if (class->rq_online)
+ class->rq_online(rq);
+ }
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ const struct sched_class *class;
+
+ for_each_class(class) {
+ if (class->rq_offline)
+ class->rq_offline(rq);
+ }
+
+ cpu_clear(rq->cpu, rq->rd->online);
+ rq->online = 0;
+ }
+}
+
/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
@@ -6089,7 +6164,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpu_isset(cpu, rq->rd->span));
- cpu_set(cpu, rq->rd->online);
+
+ set_rq_online(rq);
}
spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -6150,7 +6226,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpu_isset(cpu, rq->rd->span));
- cpu_clear(cpu, rq->rd->online);
+ set_rq_offline(rq);
}
spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -6184,6 +6260,28 @@ void __init migration_init(void)
#ifdef CONFIG_SCHED_DEBUG
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+ switch (lvl) {
+ case SD_LV_NONE:
+ return "NONE";
+ case SD_LV_SIBLING:
+ return "SIBLING";
+ case SD_LV_MC:
+ return "MC";
+ case SD_LV_CPU:
+ return "CPU";
+ case SD_LV_NODE:
+ return "NODE";
+ case SD_LV_ALLNODES:
+ return "ALLNODES";
+ case SD_LV_MAX:
+ return "MAX";
+
+ }
+ return "MAX";
+}
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
@@ -6203,7 +6301,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s\n", str);
+ printk(KERN_CONT "span %s level %s\n",
+ str, sd_level_to_string(sd->level));
if (!cpu_isset(cpu, sd->span)) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6287,9 +6386,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
kfree(groupmask);
}
-#else
+#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
{
@@ -6349,20 +6448,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
unsigned long flags;
- const struct sched_class *class;
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
struct root_domain *old_rd = rq->rd;
- for (class = sched_class_highest; class; class = class->next) {
- if (class->leave_domain)
- class->leave_domain(rq);
- }
+ if (cpu_isset(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
cpu_clear(rq->cpu, old_rd->span);
- cpu_clear(rq->cpu, old_rd->online);
if (atomic_dec_and_test(&old_rd->refcount))
kfree(old_rd);
@@ -6373,12 +6468,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpu_set(rq->cpu, rd->span);
if (cpu_isset(rq->cpu, cpu_online_map))
- cpu_set(rq->cpu, rd->online);
-
- for (class = sched_class_highest; class; class = class->next) {
- if (class->join_domain)
- class->join_domain(rq);
- }
+ set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6389,6 +6479,8 @@ static void init_rootdomain(struct root_domain *rd)
cpus_clear(rd->span);
cpus_clear(rd->online);
+
+ cpupri_init(&rd->cpupri);
}
static void init_defrootdomain(void)
@@ -6484,7 +6576,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
cpus_clear(*covered);
- for_each_cpu_mask(i, *span) {
+ for_each_cpu_mask_nr(i, *span) {
struct sched_group *sg;
int group = group_fn(i, cpu_map, &sg, tmpmask);
int j;
@@ -6495,7 +6587,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
cpus_clear(sg->cpumask);
sg->__cpu_power = 0;
- for_each_cpu_mask(j, *span) {
+ for_each_cpu_mask_nr(j, *span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
continue;
@@ -6531,9 +6623,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
min_val = INT_MAX;
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < nr_node_ids; i++) {
/* Start at @node */
- n = (node + i) % MAX_NUMNODES;
+ n = (node + i) % nr_node_ids;
if (!nr_cpus_node(n))
continue;
@@ -6583,7 +6675,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
cpus_or(*span, *span, *nodemask);
}
}
-#endif
+#endif /* CONFIG_NUMA */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
@@ -6602,7 +6694,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
*sg = &per_cpu(sched_group_cpus, cpu);
return cpu;
}
-#endif
+#endif /* CONFIG_SCHED_SMT */
/*
* multi-core sched-domains:
@@ -6610,7 +6702,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
-#endif
+#endif /* CONFIG_SCHED_MC */
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
@@ -6695,7 +6787,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
if (!sg)
return;
do {
- for_each_cpu_mask(j, sg->cpumask) {
+ for_each_cpu_mask_nr(j, sg->cpumask) {
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j);
@@ -6712,7 +6804,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
sg = sg->next;
} while (sg != group_head);
}
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
@@ -6720,14 +6812,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
{
int cpu, i;
- for_each_cpu_mask(cpu, *cpu_map) {
+ for_each_cpu_mask_nr(cpu, *cpu_map) {
struct sched_group **sched_group_nodes
= sched_group_nodes_bycpu[cpu];
if (!sched_group_nodes)
continue;
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < nr_node_ids; i++) {
struct sched_group *oldsg, *sg = sched_group_nodes[i];
*nodemask = node_to_cpumask(i);
@@ -6749,11 +6841,11 @@ next_sg:
sched_group_nodes_bycpu[cpu] = NULL;
}
}
-#else
+#else /* !CONFIG_NUMA */
static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
{
}
-#endif
+#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
@@ -6871,7 +6963,12 @@ static int default_relax_domain_level = -1;
static int __init setup_relax_domain_level(char *str)
{
- default_relax_domain_level = simple_strtoul(str, NULL, 0);
+ unsigned long val;
+
+ val = simple_strtoul(str, NULL, 0);
+ if (val < SD_LV_MAX)
+ default_relax_domain_level = val;
+
return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
@@ -6915,7 +7012,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
/*
* Allocate the per-node list of sched groups
*/
- sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+ sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6954,7 +7051,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
/*
* Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd = NULL, *p;
SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7021,7 +7118,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7038,7 +7135,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#ifdef CONFIG_SCHED_MC
/* Set up multi-core groups */
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
SCHED_CPUMASK_VAR(this_core_map, allmasks);
SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7054,7 +7151,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#endif
/* Set up physical groups */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < nr_node_ids; i++) {
SCHED_CPUMASK_VAR(nodemask, allmasks);
SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7078,7 +7175,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
send_covered, tmpmask);
}
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < nr_node_ids; i++) {
/* Set up node groups */
struct sched_group *sg, *prev;
SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7105,7 +7202,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
goto error;
}
sched_group_nodes[i] = sg;
- for_each_cpu_mask(j, *nodemask) {
+ for_each_cpu_mask_nr(j, *nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
@@ -7117,9 +7214,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpus_or(*covered, *covered, *nodemask);
prev = sg;
- for (j = 0; j < MAX_NUMNODES; j++) {
+ for (j = 0; j < nr_node_ids; j++) {
SCHED_CPUMASK_VAR(notcovered, allmasks);
- int n = (i + j) % MAX_NUMNODES;
+ int n = (i + j) % nr_node_ids;
node_to_cpumask_ptr(pnodemask, n);
cpus_complement(*notcovered, *covered);
@@ -7151,28 +7248,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd = &per_cpu(cpu_domains, i);
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd = &per_cpu(core_domains, i);
init_sched_groups_power(i, sd);
}
#endif
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd = &per_cpu(phys_domains, i);
init_sched_groups_power(i, sd);
}
#ifdef CONFIG_NUMA
- for (i = 0; i < MAX_NUMNODES; i++)
+ for (i = 0; i < nr_node_ids; i++)
init_numa_sched_groups_power(sched_group_nodes[i]);
if (sd_allnodes) {
@@ -7185,7 +7282,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#endif
/* Attach the domains */
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -7230,6 +7327,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
}
/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+ ndoms_cur = 0;
+ if (doms_cur != &fallback_doms)
+ kfree(doms_cur);
+ doms_cur = &fallback_doms;
+}
+
+/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
@@ -7268,7 +7377,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
unregister_sched_domain_sysctl();
- for_each_cpu_mask(i, *cpu_map)
+ for_each_cpu_mask_nr(i, *cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7376,6 +7485,7 @@ int arch_reinit_sched_domains(void)
get_online_cpus();
mutex_lock(&sched_domains_mutex);
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
err = arch_init_sched_domains(&cpu_online_map);
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
@@ -7444,7 +7554,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
#endif
return err;
}
-#endif
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
/*
* Force a reinitialization of the sched domains hierarchy. The domains
@@ -7455,20 +7565,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int cpu = (int)(long)hcpu;
+
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
+ disable_runtime(cpu_rq(cpu));
+ /* fall-through */
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
return NOTIFY_OK;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
+
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ enable_runtime(cpu_rq(cpu));
+ /* fall-through */
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
/*
@@ -7479,8 +7597,16 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_DONE;
}
+#ifndef CONFIG_CPUSETS
+ /*
+ * Create default domain partitioning if cpusets are disabled.
+ * Otherwise we let cpusets rebuild the domains based on the
+ * current setup.
+ */
+
/* The hotplug lock is already held by cpu_up/cpu_down */
arch_init_sched_domains(&cpu_online_map);
+#endif
return NOTIFY_OK;
}
@@ -7542,7 +7668,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
array = &rt_rq->active;
for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
+ INIT_LIST_HEAD(array->xqueue + i);
+ INIT_LIST_HEAD(array->squeue + i);
__clear_bit(i, array->bitmap);
}
/* delimiter for bitsearch: */
@@ -7661,8 +7788,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
init_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -7676,8 +7803,8 @@ void __init sched_init(void)
root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
}
#ifdef CONFIG_SMP
@@ -7693,8 +7820,8 @@ void __init sched_init(void)
#ifdef CONFIG_USER_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), RUNTIME_INF);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
@@ -7704,8 +7831,8 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
init_task_group.parent = &root_task_group;
list_add(&init_task_group.siblings, &root_task_group.children);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
for_each_possible_cpu(i) {
struct rq *rq;
@@ -7785,6 +7912,7 @@ void __init sched_init(void)
rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->cpu = i;
+ rq->online = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
rq_attach_root(rq, &def_root_domain);
@@ -7800,7 +7928,7 @@ void __init sched_init(void)
#endif
#ifdef CONFIG_SMP
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif
#ifdef CONFIG_RT_MUTEXES
@@ -8024,7 +8152,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
}
-#else
+#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
@@ -8042,7 +8170,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static void free_rt_sched_group(struct task_group *tg)
@@ -8113,7 +8241,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
}
@@ -8131,7 +8259,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_GROUP_SCHED
static void free_sched_group(struct task_group *tg)
@@ -8242,7 +8370,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, &flags);
}
-#endif
+#endif /* CONFIG_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8492,7 +8620,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
unsigned long flags;
@@ -8510,7 +8638,7 @@ static int sched_rt_global_constraints(void)
return 0;
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
int sched_rt_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
@@ -8618,7 +8746,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
return (u64) tg->shares;
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8642,7 +8770,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
return sched_group_rt_period(cgroup_tg(cgrp));
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED