summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2008-06-13 12:20:38 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2008-06-13 12:40:07 +1000
commit6ae696b3e3623ef3307225ddf5e5ae60758781af (patch)
tree05f4ca0f41aeea5366d010491cf9193d50b55028 /kernel
parent8410f467dba6ef08c785c1f0a969018750652a40 (diff)
parent8c93f8a3eb71bf7b3e1909e2f005f24e487af610 (diff)
Merge commit 'sched/auto-sched-next'
Conflicts: kernel/Makefile
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/cpu.c24
-rw-r--r--kernel/cpuset.c20
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/rcuclassic.c30
-rw-r--r--kernel/rcupdate.c71
-rw-r--r--kernel/rcupreempt.c417
-rw-r--r--kernel/rcupreempt_trace.c1
-rw-r--r--kernel/rcutorture.c34
-rw-r--r--kernel/sched.c266
-rw-r--r--kernel/sched_cpupri.c174
-rw-r--r--kernel/sched_cpupri.h36
-rw-r--r--kernel/sched_fair.c19
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c305
16 files changed, 1127 insertions, 279 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ca2433e84873..e66a541d8cf8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
#
obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
- exit.o itimer.o time.o softirq.o resource.o \
+ cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
@@ -39,7 +39,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
@@ -83,6 +83,7 @@ obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_FTRACE) += trace/
obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 50ae922c6022..b21ba7e9036d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+
+/*
+ * Represents all cpu's that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#else /* CONFIG_SMP */
+
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -403,3 +425,5 @@ out:
cpu_maps_update_done();
}
#endif /* CONFIG_PM_SLEEP_SMP */
+
+#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 66103a119bfe..64a05da9bc4c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1194,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
+ if (tsk->flags & PF_THREAD_BOUND) {
+ cpumask_t mask;
+
+ mutex_lock(&callback_mutex);
+ mask = cs->cpus_allowed;
+ mutex_unlock(&callback_mutex);
+ if (!cpus_equal(tsk->cpus_allowed, mask))
+ return -EINVAL;
+ }
return security_task_setscheduler(tsk, 0, NULL);
}
@@ -1207,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
+ int err;
mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, &cpus);
- set_cpus_allowed_ptr(tsk, &cpus);
+ err = set_cpus_allowed_ptr(tsk, &cpus);
mutex_unlock(&callback_mutex);
+ if (err)
+ return;
from = oldcs->mems_allowed;
to = cs->mems_allowed;
@@ -1890,6 +1902,12 @@ static void common_cpu_mem_hotplug_unplug(void)
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
+ /*
+ * Scheduler destroys domains on hotplug events.
+ * Rebuild them based on the current settings.
+ */
+ rebuild_sched_domains();
+
cgroup_unlock();
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
set_task_cpu(k, cpu);
k->cpus_allowed = cpumask_of_cpu(cpu);
k->rt.nr_cpus_allowed = 1;
+ k->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..30bd5d4b2ac7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
+#include <linux/rculist.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ef39700412e9..b5b90b25e49f 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user)
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+ /*
+ * Get here if this CPU took its interrupt from user
+ * mode or from the idle loop, and if this is not a
+ * nested interrupt. In this case, the CPU is in
+ * a quiescent state, so count it.
+ *
+ * Also do a memory barrier. This is needed to handle
+ * the case where writes from a preempt-disable section
+ * of code get reordered into schedule() by this CPU's
+ * write buffer. The memory barrier makes sure that
+ * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
+ * by other CPUs to happen after any such write.
+ */
+
+ smp_mb(); /* See above block comment. */
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
- } else if (!in_softirq())
+
+ } else if (!in_softirq()) {
+
+ /*
+ * Get here if this CPU did not take its interrupt from
+ * softirq, in other words, if it is not interrupting
+ * a rcu_bh read-side critical section. This is an _bh
+ * critical section, so count it. The memory barrier
+ * is needed for the same reason as is the above one.
+ */
+
+ smp_mb(); /* See above block comment. */
rcu_bh_qsctr_inc(cpu);
+ }
raise_rcu_softirq();
}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..4a74b8d48d90 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,16 +39,16 @@
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
-#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/module.h>
-struct rcu_synchronize {
- struct rcu_head head;
- struct completion completion;
+enum rcu_barrier {
+ RCU_BARRIER_STD,
+ RCU_BARRIER_BH,
+ RCU_BARRIER_SCHED,
};
static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion;
* Awaken the corresponding synchronize_rcu() instance now that a
* grace period has elapsed.
*/
-static void wakeme_after_rcu(struct rcu_head *head)
+void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
@@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head *head)
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
-void synchronize_rcu(void)
-{
- struct rcu_synchronize rcu;
-
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished */
- call_rcu(&rcu.head, wakeme_after_rcu);
-
- /* Wait for it */
- wait_for_completion(&rcu.completion);
-}
+synchronize_rcu_xxx(synchronize_rcu, call_rcu)
EXPORT_SYMBOL_GPL(synchronize_rcu);
static void rcu_barrier_callback(struct rcu_head *notused)
@@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
-static void rcu_barrier_func(void *notused)
+static void rcu_barrier_func(void *type)
{
int cpu = smp_processor_id();
struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
atomic_inc(&rcu_barrier_cpu_count);
- call_rcu(head, rcu_barrier_callback);
+ switch ((enum rcu_barrier)type) {
+ case RCU_BARRIER_STD:
+ call_rcu(head, rcu_barrier_callback);
+ break;
+ case RCU_BARRIER_BH:
+ call_rcu_bh(head, rcu_barrier_callback);
+ break;
+ case RCU_BARRIER_SCHED:
+ call_rcu_sched(head, rcu_barrier_callback);
+ break;
+ }
}
-/**
- * rcu_barrier - Wait until all the in-flight RCUs are complete.
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
*/
-void rcu_barrier(void)
+static void _rcu_barrier(enum rcu_barrier type)
{
BUG_ON(in_interrupt());
/* Take cpucontrol mutex to protect against CPU hotplug */
@@ -127,13 +128,39 @@ void rcu_barrier(void)
* until all the callbacks are queued.
*/
rcu_read_lock();
- on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+ on_each_cpu(rcu_barrier_func, (void *)type, 0, 1);
rcu_read_unlock();
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+ _rcu_barrier(RCU_BARRIER_STD);
+}
EXPORT_SYMBOL_GPL(rcu_barrier);
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+ _rcu_barrier(RCU_BARRIER_BH);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+ _rcu_barrier(RCU_BARRIER_SCHED);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
void __init rcu_init(void)
{
__rcu_init();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index c69000fd5c3a..e66167decb8f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,11 +46,11 @@
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
+#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/random.h>
#include <linux/delay.h>
@@ -87,9 +87,14 @@ struct rcu_data {
struct rcu_head **nexttail;
struct rcu_head *waitlist[GP_STAGES];
struct rcu_head **waittail[GP_STAGES];
- struct rcu_head *donelist;
+ struct rcu_head *donelist; /* from waitlist & waitschedlist */
struct rcu_head **donetail;
long rcu_flipctr[2];
+ struct rcu_head *nextschedlist;
+ struct rcu_head **nextschedtail;
+ struct rcu_head *waitschedlist;
+ struct rcu_head **waitschedtail;
+ int rcu_sched_sleeping;
#ifdef CONFIG_RCU_TRACE
struct rcupreempt_trace trace;
#endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +136,24 @@ enum rcu_try_flip_states {
rcu_try_flip_waitmb_state,
};
+/*
+ * States for rcu_ctrlblk.rcu_sched_sleep.
+ */
+
+enum rcu_sched_sleep_states {
+ rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
+ rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
+ rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
+};
+
struct rcu_ctrlblk {
spinlock_t fliplock; /* Protect state-machine transitions. */
long completed; /* Number of last completed batch. */
enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
the rcu state machine */
+ spinlock_t schedlock; /* Protect rcu_sched sleep state. */
+ enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+ wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
};
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +161,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
.completed = 0,
.rcu_try_flip_state = rcu_try_flip_idle_state,
+ .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+ .sched_sleep = rcu_sched_not_sleeping,
+ .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
};
+static struct task_struct *rcu_sched_grace_period_task;
#ifdef CONFIG_RCU_TRACE
static char *rcu_try_flip_state_names[] =
@@ -207,6 +229,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
*/
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
+#define RCU_SCHED_BATCH_TIME (HZ / 50)
+
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
@@ -411,32 +435,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
}
}
-#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+ .dynticks = 1,
+};
-DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
-static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+#ifdef CONFIG_NO_HZ
static DEFINE_PER_CPU(int, rcu_update_flag);
/**
* rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
*
* If the CPU was idle with dynamic ticks active, this updates the
- * dynticks_progress_counter to let the RCU handling know that the
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that the
* CPU is active.
*/
void rcu_irq_enter(void)
{
int cpu = smp_processor_id();
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
if (per_cpu(rcu_update_flag, cpu))
per_cpu(rcu_update_flag, cpu)++;
/*
* Only update if we are coming from a stopped ticks mode
- * (dynticks_progress_counter is even).
+ * (rcu_dyntick_sched.dynticks is even).
*/
if (!in_interrupt() &&
- (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+ (rdssp->dynticks & 0x1) == 0) {
/*
* The following might seem like we could have a race
* with NMI/SMIs. But this really isn't a problem.
@@ -459,12 +485,12 @@ void rcu_irq_enter(void)
* RCU read-side critical sections on this CPU would
* have already completed.
*/
- per_cpu(dynticks_progress_counter, cpu)++;
+ rdssp->dynticks++;
/*
* The following memory barrier ensures that any
* rcu_read_lock() primitives in the irq handler
* are seen by other CPUs to follow the above
- * increment to dynticks_progress_counter. This is
+ * increment to rcu_dyntick_sched.dynticks. This is
* required in order for other CPUs to correctly
* determine when it is safe to advance the RCU
* grace-period state machine.
@@ -472,7 +498,7 @@ void rcu_irq_enter(void)
smp_mb(); /* see above block comment. */
/*
* Since we can't determine the dynamic tick mode from
- * the dynticks_progress_counter after this routine,
+ * the rcu_dyntick_sched.dynticks after this routine,
* we use a second flag to acknowledge that we came
* from an idle state with ticks stopped.
*/
@@ -480,7 +506,7 @@ void rcu_irq_enter(void)
/*
* If we take an NMI/SMI now, they will also increment
* the rcu_update_flag, and will not update the
- * dynticks_progress_counter on exit. That is for
+ * rcu_dyntick_sched.dynticks on exit. That is for
* this IRQ to do.
*/
}
@@ -490,12 +516,13 @@ void rcu_irq_enter(void)
* rcu_irq_exit - Called from exiting Hard irq context.
*
* If the CPU was idle with dynamic ticks active, update the
- * dynticks_progress_counter to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to put let the RCU handling be
* aware that the CPU is going back to idle with no ticks.
*/
void rcu_irq_exit(void)
{
int cpu = smp_processor_id();
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
/*
* rcu_update_flag is set if we interrupted the CPU
@@ -503,7 +530,7 @@ void rcu_irq_exit(void)
* Once this occurs, we keep track of interrupt nesting
* because a NMI/SMI could also come in, and we still
* only want the IRQ that started the increment of the
- * dynticks_progress_counter to be the one that modifies
+ * rcu_dyntick_sched.dynticks to be the one that modifies
* it on exit.
*/
if (per_cpu(rcu_update_flag, cpu)) {
@@ -515,28 +542,29 @@ void rcu_irq_exit(void)
/*
* If an NMI/SMI happens now we are still
- * protected by the dynticks_progress_counter being odd.
+ * protected by the rcu_dyntick_sched.dynticks being odd.
*/
/*
* The following memory barrier ensures that any
* rcu_read_unlock() primitives in the irq handler
* are seen by other CPUs to preceed the following
- * increment to dynticks_progress_counter. This
+ * increment to rcu_dyntick_sched.dynticks. This
* is required in order for other CPUs to determine
* when it is safe to advance the RCU grace-period
* state machine.
*/
smp_mb(); /* see above block comment. */
- per_cpu(dynticks_progress_counter, cpu)++;
- WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+ rdssp->dynticks++;
+ WARN_ON(rdssp->dynticks & 0x1);
}
}
static void dyntick_save_progress_counter(int cpu)
{
- per_cpu(rcu_dyntick_snapshot, cpu) =
- per_cpu(dynticks_progress_counter, cpu);
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->dynticks_snap = rdssp->dynticks;
}
static inline int
@@ -544,9 +572,10 @@ rcu_try_flip_waitack_needed(int cpu)
{
long curr;
long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
- curr = per_cpu(dynticks_progress_counter, cpu);
- snap = per_cpu(rcu_dyntick_snapshot, cpu);
+ curr = rdssp->dynticks;
+ snap = rdssp->dynticks_snap;
smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
/*
@@ -567,7 +596,7 @@ rcu_try_flip_waitack_needed(int cpu)
* that this CPU already acknowledged the counter.
*/
- if ((curr - snap) > 2 || (snap & 0x1) == 0)
+ if ((curr - snap) > 2 || (curr & 0x1) == 0)
return 0;
/* We need this CPU to explicitly acknowledge the counter flip. */
@@ -580,9 +609,10 @@ rcu_try_flip_waitmb_needed(int cpu)
{
long curr;
long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
- curr = per_cpu(dynticks_progress_counter, cpu);
- snap = per_cpu(rcu_dyntick_snapshot, cpu);
+ curr = rdssp->dynticks;
+ snap = rdssp->dynticks_snap;
smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
/*
@@ -609,14 +639,86 @@ rcu_try_flip_waitmb_needed(int cpu)
return 1;
}
+static void dyntick_save_progress_counter_sched(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->sched_dynticks_snap = rdssp->dynticks;
+}
+
+static int rcu_qsctr_inc_needed_dyntick(int cpu)
+{
+ long curr;
+ long snap;
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ curr = rdssp->dynticks;
+ snap = rdssp->sched_dynticks_snap;
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot be in the middle of an rcu_read_lock(), so
+ * the next rcu_read_lock() it executes must use the new value
+ * of the counter. Therefore, this CPU has been in a quiescent
+ * state the entire time, and we don't need to wait for it.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq handlers, then, as above, this CPU has already
+ * passed through a quiescent state.
+ */
+
+ if ((curr - snap) > 2 || (snap & 0x1) == 0)
+ return 0;
+
+ /* We need this CPU to go through a quiescent state. */
+
+ return 1;
+}
+
#else /* !CONFIG_NO_HZ */
-# define dyntick_save_progress_counter(cpu) do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu) (1)
-# define rcu_try_flip_waitmb_needed(cpu) (1)
+# define dyntick_save_progress_counter(cpu) do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu) (1)
+# define rcu_try_flip_waitmb_needed(cpu) (1)
+
+# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
+# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
#endif /* CONFIG_NO_HZ */
+static void save_qsctr_sched(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ rdssp->sched_qs_snap = rdssp->sched_qs;
+}
+
+static inline int rcu_qsctr_inc_needed(int cpu)
+{
+ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+ /*
+ * If there has been a quiescent state, no more need to wait
+ * on this CPU.
+ */
+
+ if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+ smp_mb(); /* force ordering with cpu entering schedule(). */
+ return 0;
+ }
+
+ /* We need this CPU to go through a quiescent state. */
+
+ return 1;
+}
+
/*
* Get here when RCU is idle. Decide whether we need to
* move out of idle state, and return non-zero if so.
@@ -819,6 +921,26 @@ void rcu_check_callbacks(int cpu, int user)
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+ /*
+ * If this CPU took its interrupt from user mode or from the
+ * idle loop, and this is not a nested interrupt, then
+ * this CPU has to have exited all prior preept-disable
+ * sections of code. So increment the counter to note this.
+ *
+ * The memory barrier is needed to handle the case where
+ * writes from a preempt-disable section of code get reordered
+ * into schedule() by this CPU's write buffer. So the memory
+ * barrier makes sure that the rcu_qsctr_inc() is seen by other
+ * CPUs to happen after any such write.
+ */
+
+ if (user ||
+ (idle_cpu(cpu) && !in_softirq() &&
+ hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ smp_mb(); /* Guard against aggressive schedule(). */
+ rcu_qsctr_inc(cpu);
+ }
+
rcu_check_mb(cpu);
if (rcu_ctrlblk.completed == rdp->completed)
rcu_try_flip();
@@ -869,6 +991,8 @@ void rcu_offline_cpu(int cpu)
struct rcu_head *list = NULL;
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+ struct rcu_head *schedlist = NULL;
+ struct rcu_head **schedtail = &schedlist;
struct rcu_head **tail = &list;
/*
@@ -882,6 +1006,11 @@ void rcu_offline_cpu(int cpu)
rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
list, tail);
rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+ rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+ schedlist, schedtail);
+ rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+ schedlist, schedtail);
+ rdp->rcu_sched_sleeping = 0;
spin_unlock_irqrestore(&rdp->lock, flags);
rdp->waitlistcount = 0;
@@ -916,22 +1045,40 @@ void rcu_offline_cpu(int cpu)
* fix.
*/
- local_irq_save(flags);
+ local_irq_save(flags); /* disable preempt till we know what lock. */
rdp = RCU_DATA_ME();
spin_lock(&rdp->lock);
*rdp->nexttail = list;
if (list)
rdp->nexttail = tail;
+ *rdp->nextschedtail = schedlist;
+ if (schedlist)
+ rdp->nextschedtail = schedtail;
spin_unlock_irqrestore(&rdp->lock, flags);
}
void __devinit rcu_online_cpu(int cpu)
{
unsigned long flags;
+ struct rcu_data *rdp;
spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
cpu_set(cpu, rcu_cpu_online_map);
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+ /*
+ * The rcu_sched grace-period processing might have bypassed
+ * this CPU, given that it was not in the rcu_cpu_online_map
+ * when the grace-period scan started. This means that the
+ * grace-period task might sleep. So make sure that if this
+ * should happen, the first callback posted to this CPU will
+ * wake up the grace-period task if need be.
+ */
+
+ rdp = RCU_DATA_CPU(cpu);
+ spin_lock_irqsave(&rdp->lock, flags);
+ rdp->rcu_sched_sleeping = 1;
+ spin_unlock_irqrestore(&rdp->lock, flags);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -986,31 +1133,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
*rdp->nexttail = head;
rdp->nexttail = &head->next;
RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
- spin_unlock(&rdp->lock);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&rdp->lock, flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ int wake_gp = 0;
+
+ head->func = func;
+ head->next = NULL;
+ local_irq_save(flags);
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
+ *rdp->nextschedtail = head;
+ rdp->nextschedtail = &head->next;
+ if (rdp->rcu_sched_sleeping) {
+
+ /* Grace-period processing might be sleeping... */
+
+ rdp->rcu_sched_sleeping = 0;
+ wake_gp = 1;
+ }
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ if (wake_gp) {
+
+ /* Wake up grace-period processing, unless someone beat us. */
+
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+ wake_gp = 0;
+ rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ if (wake_gp)
+ wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+ }
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
/*
* Wait until all currently running preempt_disable() code segments
* (including hardware-irq-disable segments) complete. Note that
* in -rt this does -not- necessarily result in all currently executing
* interrupt -handlers- having completed.
*/
-void __synchronize_sched(void)
+synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * kthread function that manages call_rcu_sched grace periods.
+ */
+static int rcu_sched_grace_period(void *arg)
{
- cpumask_t oldmask;
+ int couldsleep; /* might sleep after current pass. */
+ int couldsleepnext = 0; /* might sleep after next pass. */
int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ int ret;
- if (sched_getaffinity(0, &oldmask) < 0)
- oldmask = cpu_possible_map;
- for_each_online_cpu(cpu) {
- sched_setaffinity(0, &cpumask_of_cpu(cpu));
- schedule();
- }
- sched_setaffinity(0, &oldmask);
+ /*
+ * Each pass through the following loop handles one
+ * rcu_sched grace period cycle.
+ */
+ do {
+ /* Save each CPU's current state. */
+
+ for_each_online_cpu(cpu) {
+ dyntick_save_progress_counter_sched(cpu);
+ save_qsctr_sched(cpu);
+ }
+
+ /*
+ * Sleep for about an RCU grace-period's worth to
+ * allow better batching and to consume less CPU.
+ */
+ schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+
+ /*
+ * If there was nothing to do last time, prepare to
+ * sleep at the end of the current grace period cycle.
+ */
+ couldsleep = couldsleepnext;
+ couldsleepnext = 1;
+ if (couldsleep) {
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ }
+
+ /*
+ * Wait on each CPU in turn to have either visited
+ * a quiescent state or been in dynticks-idle mode.
+ */
+ for_each_online_cpu(cpu) {
+ while (rcu_qsctr_inc_needed(cpu) &&
+ rcu_qsctr_inc_needed_dyntick(cpu)) {
+ /* resched_cpu(cpu); @@@ */
+ schedule_timeout_interruptible(1);
+ }
+ }
+
+ /* Advance callbacks for each CPU. */
+
+ for_each_online_cpu(cpu) {
+
+ rdp = RCU_DATA_CPU(cpu);
+ spin_lock_irqsave(&rdp->lock, flags);
+
+ /*
+ * We are running on this CPU irq-disabled, so no
+ * CPU can go offline until we re-enable irqs.
+ * The current CPU might have already gone
+ * offline (between the for_each_offline_cpu and
+ * the spin_lock_irqsave), but in that case all its
+ * callback lists will be empty, so no harm done.
+ *
+ * Advance the callbacks! We share normal RCU's
+ * donelist, since callbacks are invoked the
+ * same way in either case.
+ */
+ if (rdp->waitschedlist != NULL) {
+ *rdp->donetail = rdp->waitschedlist;
+ rdp->donetail = rdp->waitschedtail;
+
+ /*
+ * Next rcu_check_callbacks() will
+ * do the required raise_softirq().
+ */
+ }
+ if (rdp->nextschedlist != NULL) {
+ rdp->waitschedlist = rdp->nextschedlist;
+ rdp->waitschedtail = rdp->nextschedtail;
+ couldsleep = 0;
+ couldsleepnext = 0;
+ } else {
+ rdp->waitschedlist = NULL;
+ rdp->waitschedtail = &rdp->waitschedlist;
+ }
+ rdp->nextschedlist = NULL;
+ rdp->nextschedtail = &rdp->nextschedlist;
+
+ /* Mark sleep intention. */
+
+ rdp->rcu_sched_sleeping = couldsleep;
+
+ spin_unlock_irqrestore(&rdp->lock, flags);
+ }
+
+ /* If we saw callbacks on the last scan, go deal with them. */
+
+ if (!couldsleep)
+ continue;
+
+ /* Attempt to block... */
+
+ spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+ if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+
+ /*
+ * Someone posted a callback after we scanned.
+ * Go take care of it.
+ */
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ couldsleepnext = 0;
+ continue;
+ }
+
+ /* Block until the next person posts a callback. */
+
+ rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+ spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+ ret = 0;
+ __wait_event_interruptible(rcu_ctrlblk.sched_wq,
+ rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+ ret);
+
+ /*
+ * Signals would prevent us from sleeping, and we cannot
+ * do much with them in any case. So flush them.
+ */
+ if (ret)
+ flush_signals(current);
+ couldsleepnext = 0;
+
+ } while (!kthread_should_stop());
+
+ return (0);
}
-EXPORT_SYMBOL_GPL(__synchronize_sched);
/*
* Check to see if any future RCU-related work will need to be done
@@ -1027,7 +1339,9 @@ int rcu_needs_cpu(int cpu)
return (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
- rdp->nextlist != NULL);
+ rdp->nextlist != NULL ||
+ rdp->nextschedlist != NULL ||
+ rdp->waitschedlist != NULL);
}
int rcu_pending(int cpu)
@@ -1038,7 +1352,9 @@ int rcu_pending(int cpu)
if (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
- rdp->nextlist != NULL)
+ rdp->nextlist != NULL ||
+ rdp->nextschedlist != NULL ||
+ rdp->waitschedlist != NULL)
return 1;
/* The RCU core needs an acknowledgement from this CPU. */
@@ -1105,6 +1421,11 @@ void __init __rcu_init(void)
rdp->donetail = &rdp->donelist;
rdp->rcu_flipctr[0] = 0;
rdp->rcu_flipctr[1] = 0;
+ rdp->nextschedlist = NULL;
+ rdp->nextschedtail = &rdp->nextschedlist;
+ rdp->waitschedlist = NULL;
+ rdp->waitschedtail = &rdp->waitschedlist;
+ rdp->rcu_sched_sleeping = 0;
}
register_cpu_notifier(&rcu_nb);
@@ -1127,11 +1448,15 @@ void __init __rcu_init(void)
}
/*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * Late-boot-time RCU initialization that must wait until after scheduler
+ * has been initialized.
*/
-void synchronize_kernel(void)
+void __init rcu_init_sched(void)
{
- synchronize_rcu();
+ rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+ NULL,
+ "rcu_sched_grace_period");
+ WARN_ON(IS_ERR(rcu_sched_grace_period_task));
}
#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..5edf82c34bbc 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/rcupreempt_trace.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 33acc424667e..0334b6a8baca 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,6 +192,7 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferredfree)(struct rcu_torture *p);
void (*sync)(void);
+ void (*cb_barrier)(void);
int (*stats)(char *page);
char *name;
};
@@ -265,6 +266,7 @@ static struct rcu_torture_ops rcu_ops = {
.completed = rcu_torture_completed,
.deferredfree = rcu_torture_deferred_free,
.sync = synchronize_rcu,
+ .cb_barrier = rcu_barrier,
.stats = NULL,
.name = "rcu"
};
@@ -304,6 +306,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.completed = rcu_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu,
+ .cb_barrier = NULL,
.stats = NULL,
.name = "rcu_sync"
};
@@ -364,6 +367,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_bh_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
+ .cb_barrier = rcu_barrier_bh,
.stats = NULL,
.name = "rcu_bh"
};
@@ -377,6 +381,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
+ .cb_barrier = NULL,
.stats = NULL,
.name = "rcu_bh_sync"
};
@@ -458,6 +463,7 @@ static struct rcu_torture_ops srcu_ops = {
.completed = srcu_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .cb_barrier = NULL,
.stats = srcu_torture_stats,
.name = "srcu"
};
@@ -482,6 +488,11 @@ static int sched_torture_completed(void)
return 0;
}
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
static void sched_torture_synchronize(void)
{
synchronize_sched();
@@ -494,12 +505,27 @@ static struct rcu_torture_ops sched_ops = {
.readdelay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
.completed = sched_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
+ .deferredfree = rcu_sched_torture_deferred_free,
.sync = sched_torture_synchronize,
+ .cb_barrier = rcu_barrier_sched,
.stats = NULL,
.name = "sched"
};
+static struct rcu_torture_ops sched_ops_sync = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .readdelay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferredfree = rcu_sync_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .name = "sched_sync"
+};
+
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
@@ -848,7 +874,9 @@ rcu_torture_cleanup(void)
stats_task = NULL;
/* Wait for all RCU callbacks to fire. */
- rcu_barrier();
+
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
@@ -868,7 +896,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
- &srcu_ops, &sched_ops, };
+ &srcu_ops, &sched_ops, &sched_ops_sync, };
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 627105dfdd90..81f8ac566ff9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,8 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include "sched_cpupri.h"
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -152,7 +154,8 @@ static inline int task_has_rt_policy(struct task_struct *p)
*/
struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_RT_PRIO];
+ struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
+ struct list_head squeue[MAX_RT_PRIO]; /* shared queue */
};
struct rt_bandwidth {
@@ -290,15 +293,15 @@ struct task_group root_task_group;
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
-#else
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
#define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -308,9 +311,9 @@ static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
/*
* A weight of 0 or 1 can cause arithmetics problems.
@@ -453,6 +456,9 @@ struct root_domain {
*/
cpumask_t rto_mask;
atomic_t rto_count;
+#ifdef CONFIG_SMP
+ struct cpupri cpupri;
+#endif
};
/*
@@ -527,6 +533,7 @@ struct rq {
int push_cpu;
/* cpu of this runqueue: */
int cpu;
+ int online;
struct task_struct *migration_thread;
struct list_head migration_queue;
@@ -1148,6 +1155,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+#ifdef CONFIG_SMP
static void hotplug_hrtick_disable(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -1203,6 +1211,7 @@ static void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
+#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
{
@@ -1332,15 +1341,15 @@ void wake_up_idle_cpu(int cpu)
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
-#endif
+#endif /* CONFIG_NO_HZ */
-#else
+#else /* !CONFIG_SMP */
static void __resched_task(struct task_struct *p, int tif_bit)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_thread_flag(p, tif_bit);
}
-#endif
+#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
@@ -1500,16 +1509,8 @@ static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-}
#endif
-#endif /* CONFIG_SMP */
-
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1519,6 +1520,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#endif
#define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
static inline void inc_load(struct rq *rq, const struct task_struct *p)
{
@@ -1655,12 +1658,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->load.weight;
-}
-
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
@@ -1689,6 +1686,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
/*
* Is this task likely cache-hot:
*/
@@ -2151,7 +2154,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
}
}
}
-#endif
+#endif /* CONFIG_SCHEDSTATS */
out_activate:
#endif /* CONFIG_SMP */
@@ -2353,7 +2356,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -2365,7 +2368,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
/**
* prepare_task_switch - prepare to switch tasks
@@ -3697,6 +3700,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ int need_serialize;
cpumask_t tmp;
for_each_domain(cpu, sd) {
@@ -3714,8 +3718,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (interval > HZ*NR_CPUS/10)
interval = HZ*NR_CPUS/10;
+ need_serialize = sd->flags & SD_SERIALIZE;
- if (sd->flags & SD_SERIALIZE) {
+ if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
@@ -3731,7 +3736,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
sd->last_balance = jiffies;
}
- if (sd->flags & SD_SERIALIZE)
+ if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4116,6 +4121,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
prev->comm, prev->pid, preempt_count());
debug_show_held_locks(prev);
+ print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
@@ -4189,7 +4195,7 @@ asmlinkage void __sched schedule(void)
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq *rq;
- int cpu;
+ int cpu, hrtick = sched_feat(HRTICK);
need_resched:
preempt_disable();
@@ -4204,7 +4210,8 @@ need_resched_nonpreemptible:
schedule_debug(prev);
- hrtick_clear(rq);
+ if (hrtick)
+ hrtick_clear(rq);
/*
* Do the rq-clock update outside the rq lock:
@@ -4250,7 +4257,8 @@ need_resched_nonpreemptible:
} else
spin_unlock_irq(&rq->lock);
- hrtick_set(rq);
+ if (hrtick)
+ hrtick_set(rq);
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
@@ -5118,24 +5126,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
return sched_setaffinity(pid, &new_mask);
}
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
long sched_getaffinity(pid_t pid, cpumask_t *mask)
{
struct task_struct *p;
@@ -5619,6 +5609,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
goto out;
}
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+ !cpus_equal(p->cpus_allowed, *new_mask))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
else {
@@ -6106,6 +6102,36 @@ static void unregister_sched_domain_sysctl(void)
}
#endif
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ const struct sched_class *class;
+
+ cpu_set(rq->cpu, rq->rd->online);
+ rq->online = 1;
+
+ for_each_class(class) {
+ if (class->rq_online)
+ class->rq_online(rq);
+ }
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ const struct sched_class *class;
+
+ for_each_class(class) {
+ if (class->rq_offline)
+ class->rq_offline(rq);
+ }
+
+ cpu_clear(rq->cpu, rq->rd->online);
+ rq->online = 0;
+ }
+}
+
/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
@@ -6143,7 +6169,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpu_isset(cpu, rq->rd->span));
- cpu_set(cpu, rq->rd->online);
+
+ set_rq_online(rq);
}
spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -6204,7 +6231,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpu_isset(cpu, rq->rd->span));
- cpu_clear(cpu, rq->rd->online);
+ set_rq_offline(rq);
}
spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -6238,6 +6265,28 @@ void __init migration_init(void)
#ifdef CONFIG_SCHED_DEBUG
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+ switch (lvl) {
+ case SD_LV_NONE:
+ return "NONE";
+ case SD_LV_SIBLING:
+ return "SIBLING";
+ case SD_LV_MC:
+ return "MC";
+ case SD_LV_CPU:
+ return "CPU";
+ case SD_LV_NODE:
+ return "NODE";
+ case SD_LV_ALLNODES:
+ return "ALLNODES";
+ case SD_LV_MAX:
+ return "MAX";
+
+ }
+ return "MAX";
+}
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
@@ -6257,7 +6306,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s\n", str);
+ printk(KERN_CONT "span %s level %s\n",
+ str, sd_level_to_string(sd->level));
if (!cpu_isset(cpu, sd->span)) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6341,9 +6391,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
kfree(groupmask);
}
-#else
+#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
static int sd_degenerate(struct sched_domain *sd)
{
@@ -6403,20 +6453,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
unsigned long flags;
- const struct sched_class *class;
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
struct root_domain *old_rd = rq->rd;
- for (class = sched_class_highest; class; class = class->next) {
- if (class->leave_domain)
- class->leave_domain(rq);
- }
+ if (cpu_isset(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
cpu_clear(rq->cpu, old_rd->span);
- cpu_clear(rq->cpu, old_rd->online);
if (atomic_dec_and_test(&old_rd->refcount))
kfree(old_rd);
@@ -6427,12 +6473,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpu_set(rq->cpu, rd->span);
if (cpu_isset(rq->cpu, cpu_online_map))
- cpu_set(rq->cpu, rd->online);
-
- for (class = sched_class_highest; class; class = class->next) {
- if (class->join_domain)
- class->join_domain(rq);
- }
+ set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6443,6 +6484,8 @@ static void init_rootdomain(struct root_domain *rd)
cpus_clear(rd->span);
cpus_clear(rd->online);
+
+ cpupri_init(&rd->cpupri);
}
static void init_defrootdomain(void)
@@ -6637,7 +6680,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
cpus_or(*span, *span, *nodemask);
}
}
-#endif
+#endif /* CONFIG_NUMA */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
@@ -6656,7 +6699,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
*sg = &per_cpu(sched_group_cpus, cpu);
return cpu;
}
-#endif
+#endif /* CONFIG_SCHED_SMT */
/*
* multi-core sched-domains:
@@ -6664,7 +6707,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
-#endif
+#endif /* CONFIG_SCHED_MC */
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
@@ -6766,7 +6809,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
sg = sg->next;
} while (sg != group_head);
}
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
@@ -6803,11 +6846,11 @@ next_sg:
sched_group_nodes_bycpu[cpu] = NULL;
}
}
-#else
+#else /* !CONFIG_NUMA */
static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
{
}
-#endif
+#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
@@ -7289,6 +7332,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
}
/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+ ndoms_cur = 0;
+ if (doms_cur != &fallback_doms)
+ kfree(doms_cur);
+ doms_cur = &fallback_doms;
+}
+
+/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
@@ -7435,6 +7490,7 @@ int arch_reinit_sched_domains(void)
get_online_cpus();
mutex_lock(&sched_domains_mutex);
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
err = arch_init_sched_domains(&cpu_online_map);
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
@@ -7503,7 +7559,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
#endif
return err;
}
-#endif
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
/*
* Force a reinitialization of the sched domains hierarchy. The domains
@@ -7514,20 +7570,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int cpu = (int)(long)hcpu;
+
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
+ disable_runtime(cpu_rq(cpu));
+ /* fall-through */
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
return NOTIFY_OK;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
+
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ enable_runtime(cpu_rq(cpu));
+ /* fall-through */
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
/*
@@ -7538,8 +7602,16 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_DONE;
}
+#ifndef CONFIG_CPUSETS
+ /*
+ * Create default domain partitioning if cpusets are disabled.
+ * Otherwise we let cpusets rebuild the domains based on the
+ * current setup.
+ */
+
/* The hotplug lock is already held by cpu_up/cpu_down */
arch_init_sched_domains(&cpu_online_map);
+#endif
return NOTIFY_OK;
}
@@ -7601,7 +7673,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
array = &rt_rq->active;
for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
+ INIT_LIST_HEAD(array->xqueue + i);
+ INIT_LIST_HEAD(array->squeue + i);
__clear_bit(i, array->bitmap);
}
/* delimiter for bitsearch: */
@@ -7720,8 +7793,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
init_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -7735,8 +7808,8 @@ void __init sched_init(void)
root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
}
#ifdef CONFIG_SMP
@@ -7752,8 +7825,8 @@ void __init sched_init(void)
#ifdef CONFIG_USER_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), RUNTIME_INF);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
@@ -7763,8 +7836,8 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
init_task_group.parent = &root_task_group;
list_add(&init_task_group.siblings, &root_task_group.children);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
for_each_possible_cpu(i) {
struct rq *rq;
@@ -7844,6 +7917,7 @@ void __init sched_init(void)
rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->cpu = i;
+ rq->online = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
rq_attach_root(rq, &def_root_domain);
@@ -8083,7 +8157,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
}
-#else
+#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
@@ -8101,7 +8175,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static void free_rt_sched_group(struct task_group *tg)
@@ -8172,7 +8246,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
}
@@ -8190,7 +8264,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_GROUP_SCHED
static void free_sched_group(struct task_group *tg)
@@ -8301,7 +8375,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, &flags);
}
-#endif
+#endif /* CONFIG_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8551,7 +8625,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
unsigned long flags;
@@ -8569,7 +8643,7 @@ static int sched_rt_global_constraints(void)
return 0;
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
int sched_rt_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
@@ -8677,7 +8751,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
return (u64) tg->shares;
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8701,7 +8775,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
return sched_group_rt_period(cgroup_tg(cgrp));
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ * kernel/sched_cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007-2008 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+ int cpupri;
+
+ if (prio == CPUPRI_INVALID)
+ cpupri = CPUPRI_INVALID;
+ else if (prio == MAX_PRIO)
+ cpupri = CPUPRI_IDLE;
+ else if (prio >= MAX_RT_PRIO)
+ cpupri = CPUPRI_NORMAL;
+ else
+ cpupri = MAX_RT_PRIO - prio + 1;
+
+ return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx) \
+ for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
+ idx < CPUPRI_NR_PRIORITIES; \
+ idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invokation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ cpumask_t *lowest_mask)
+{
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
+
+ for_each_cpupri_active(cp->pri_active, idx) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ cpumask_t mask;
+
+ if (idx >= task_pri)
+ break;
+
+ cpus_and(mask, p->cpus_allowed, vec->mask);
+
+ if (cpus_empty(mask))
+ continue;
+
+ *lowest_mask = mask;
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ unsigned long flags;
+
+ newpri = convert_prio(newpri);
+
+ BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+ if (newpri == oldpri)
+ return;
+
+ /*
+ * If the cpu was currently mapped to a different value, we
+ * first need to unmap the old value
+ */
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ vec->count--;
+ if (!vec->count)
+ clear_bit(oldpri, cp->pri_active);
+ cpu_clear(cpu, vec->mask);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ if (likely(newpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ cpu_set(cpu, vec->mask);
+ vec->count++;
+ if (vec->count == 1)
+ set_bit(newpri, cp->pri_active);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
+
+ *currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ spin_lock_init(&vec->lock);
+ vec->count = 0;
+ cpus_clear(vec->mask);
+ }
+
+ for_each_possible_cpu(i)
+ cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+ spinlock_t lock;
+ int count;
+ cpumask_t mask;
+};
+
+struct cpupri {
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ long pri_active[CPUPRI_NR_PRI_WORDS];
+ int cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int cpupri_find(struct cpupri *cp,
+ struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 74774bde5264..0152dbd0b77a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
struct task_struct *p = NULL;
struct sched_entity *se;
- if (next == &cfs_rq->tasks)
- return NULL;
-
- /* Skip over entities that are not tasks */
- do {
+ while (next != &cfs_rq->tasks) {
se = list_entry(next, struct sched_entity, group_node);
next = next->next;
- } while (next != &cfs_rq->tasks && !entity_is_task(se));
- if (next == &cfs_rq->tasks)
- return NULL;
+ /* Skip over entities that are not tasks */
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ break;
+ }
+ }
cfs_rq->balance_iterator = next;
-
- if (entity_is_task(se))
- p = task_of(se);
-
return p;
}
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..62b39ca92ebd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
SCHED_FEAT(SYNC_WAKEUPS, 1)
SCHED_FEAT(HRTICK, 1)
SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
-SCHED_FEAT(DEADLINE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 163997018583..49a225e2729e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
static inline void rt_set_overload(struct rq *rq)
{
+ if (!rq->online)
+ return;
+
cpu_set(rq->cpu, rq->rd->rto_mask);
/*
* Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
static inline void rt_clear_overload(struct rq *rq)
{
+ if (!rq->online)
+ return;
+
/* the order here really doesn't matter */
atomic_dec(&rq->rd->rto_count);
cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -280,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
continue;
spin_lock(&iter->rt_runtime_lock);
+ if (iter->rt_runtime == RUNTIME_INF)
+ goto next;
+
diff = iter->rt_runtime - iter->rt_time;
if (diff > 0) {
do_div(diff, weight);
@@ -293,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq)
break;
}
}
+next:
spin_unlock(&iter->rt_runtime_lock);
}
spin_unlock(&rt_b->rt_runtime_lock);
return more;
}
+
+static void __disable_runtime(struct rq *rq)
+{
+ struct root_domain *rd = rq->rd;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ s64 want;
+ int i;
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ spin_lock(&rt_rq->rt_runtime_lock);
+ if (rt_rq->rt_runtime == RUNTIME_INF ||
+ rt_rq->rt_runtime == rt_b->rt_runtime)
+ goto balanced;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+
+ want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+ for_each_cpu_mask(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ if (iter == rt_rq)
+ continue;
+
+ spin_lock(&iter->rt_runtime_lock);
+ if (want > 0) {
+ diff = min_t(s64, iter->rt_runtime, want);
+ iter->rt_runtime -= diff;
+ want -= diff;
+ } else {
+ iter->rt_runtime -= want;
+ want -= want;
+ }
+ spin_unlock(&iter->rt_runtime_lock);
+
+ if (!want)
+ break;
+ }
+
+ spin_lock(&rt_rq->rt_runtime_lock);
+ BUG_ON(want);
+balanced:
+ rt_rq->rt_runtime = RUNTIME_INF;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static void disable_runtime(struct rq *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __disable_runtime(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+ struct root_domain *rd = rq->rd;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ for_each_leaf_rt_rq(rt_rq, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ spin_lock(&rt_b->rt_runtime_lock);
+ spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ rt_rq->rt_time = 0;
+ spin_unlock(&rt_rq->rt_runtime_lock);
+ spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static void enable_runtime(struct rq *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __enable_runtime(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
#endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -328,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
#ifdef CONFIG_SMP
if (rt_rq->rt_time > runtime) {
- int more;
-
spin_unlock(&rt_rq->rt_runtime_lock);
- more = balance_runtime(rt_rq);
+ balance_runtime(rt_rq);
spin_lock(&rt_rq->rt_runtime_lock);
- if (more)
- runtime = sched_rt_runtime(rt_rq);
+ runtime = sched_rt_runtime(rt_rq);
+ if (runtime == RUNTIME_INF)
+ return 0;
}
#endif
@@ -391,12 +492,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+ if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
rt_rq->highest_prio = rt_se_prio(rt_se);
+#ifdef CONFIG_SMP
+ if (rq->online)
+ cpupri_set(&rq->rd->cpupri, rq->cpu,
+ rt_se_prio(rt_se));
+#endif
+ }
#endif
#ifdef CONFIG_SMP
if (rt_se->nr_cpus_allowed > 1) {
struct rq *rq = rq_of_rt_rq(rt_rq);
+
rq->rt.rt_nr_migratory++;
}
@@ -416,6 +526,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
+#ifdef CONFIG_SMP
+ int highest_prio = rt_rq->highest_prio;
+#endif
+
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
@@ -439,6 +553,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rq->rt.rt_nr_migratory--;
}
+ if (rt_rq->highest_prio != highest_prio) {
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (rq->online)
+ cpupri_set(&rq->rd->cpupri, rq->cpu,
+ rt_rq->highest_prio);
+ }
+
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -458,7 +580,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
if (group_rq && rt_rq_throttled(group_rq))
return;
- list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+ if (rt_se->nr_cpus_allowed == 1)
+ list_add_tail(&rt_se->run_list,
+ array->xqueue + rt_se_prio(rt_se));
+ else
+ list_add_tail(&rt_se->run_list,
+ array->squeue + rt_se_prio(rt_se));
+
__set_bit(rt_se_prio(rt_se), array->bitmap);
inc_rt_tasks(rt_se, rt_rq);
@@ -470,7 +598,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
struct rt_prio_array *array = &rt_rq->active;
list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
+ if (list_empty(array->squeue + rt_se_prio(rt_se))
+ && list_empty(array->xqueue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
@@ -537,13 +666,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
/*
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
+ *
+ * Note: We always enqueue the task to the shared-queue, regardless of its
+ * previous position w.r.t. exclusive vs shared. This is so that exclusive RR
+ * tasks fairly round-robin with all tasks on the runqueue, not just other
+ * exclusive tasks.
*/
static
void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
struct rt_prio_array *array = &rt_rq->active;
- list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+ list_del_init(&rt_se->run_list);
+ list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
}
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -601,13 +736,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
}
#endif /* CONFIG_SMP */
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+ struct rt_rq *rt_rq);
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->curr->prio) {
resched_task(rq->curr);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If:
+ *
+ * - the newly woken task is of equal priority to the current task
+ * - the newly woken task is non-migratable while current is migratable
+ * - current will be preempted on the next reschedule
+ *
+ * we should check to see if current can readily move to a different
+ * cpu. If so, we will reschedule to allow the push logic to try
+ * to move current somewhere else, making room for our non-migratable
+ * task.
+ */
+ if((p->prio == rq->curr->prio)
+ && p->rt.nr_cpus_allowed == 1
+ && rq->curr->rt.nr_cpus_allowed != 1
+ && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+ cpumask_t mask;
+
+ if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+ /*
+ * There appears to be other cpus that can accept
+ * current, so lets reschedule to try and push it away
+ */
+ resched_task(rq->curr);
+ }
+#endif
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -621,8 +789,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
idx = sched_find_first_bit(array->bitmap);
BUG_ON(idx >= MAX_RT_PRIO);
- queue = array->queue + idx;
- next = list_entry(queue->next, struct sched_rt_entity, run_list);
+ queue = array->xqueue + idx;
+ if (!list_empty(queue))
+ next = list_entry(queue->next, struct sched_rt_entity,
+ run_list);
+ else {
+ queue = array->squeue + idx;
+ next = list_entry(queue->next, struct sched_rt_entity,
+ run_list);
+ }
return next;
}
@@ -692,7 +867,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
continue;
if (next && next->prio < idx)
continue;
- list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ list_for_each_entry(rt_se, array->squeue + idx, run_list) {
struct task_struct *p = rt_task_of(rt_se);
if (pick_rt_task(rq, p, cpu)) {
next = p;
@@ -710,73 +885,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
- int lowest_prio = -1;
- int lowest_cpu = -1;
- int count = 0;
- int cpu;
-
- cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
- /*
- * Scan each rq for the lowest prio.
- */
- for_each_cpu_mask(cpu, *lowest_mask) {
- struct rq *rq = cpu_rq(cpu);
-
- /* We look for lowest RT prio or non-rt CPU */
- if (rq->rt.highest_prio >= MAX_RT_PRIO) {
- /*
- * if we already found a low RT queue
- * and now we found this non-rt queue
- * clear the mask and set our bit.
- * Otherwise just return the queue as is
- * and the count==1 will cause the algorithm
- * to use the first bit found.
- */
- if (lowest_cpu != -1) {
- cpus_clear(*lowest_mask);
- cpu_set(rq->cpu, *lowest_mask);
- }
- return 1;
- }
-
- /* no locking for now */
- if ((rq->rt.highest_prio > task->prio)
- && (rq->rt.highest_prio >= lowest_prio)) {
- if (rq->rt.highest_prio > lowest_prio) {
- /* new low - clear old data */
- lowest_prio = rq->rt.highest_prio;
- lowest_cpu = cpu;
- count = 0;
- }
- count++;
- } else
- cpu_clear(cpu, *lowest_mask);
- }
-
- /*
- * Clear out all the set bits that represent
- * runqueues that were of higher prio than
- * the lowest_prio.
- */
- if (lowest_cpu > 0) {
- /*
- * Perhaps we could add another cpumask op to
- * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
- * Then that could be optimized to use memset and such.
- */
- for_each_cpu_mask(cpu, *lowest_mask) {
- if (cpu >= lowest_cpu)
- break;
- cpu_clear(cpu, *lowest_mask);
- }
- }
-
- return count;
-}
-
static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
int first;
@@ -798,17 +906,12 @@ static int find_lowest_rq(struct task_struct *task)
cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
- int count = find_lowest_cpus(task, lowest_mask);
- if (!count)
- return -1; /* No targets found */
+ if (task->rt.nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */
- /*
- * There is no sense in performing an optimal search if only one
- * target is found.
- */
- if (count == 1)
- return first_cpu(*lowest_mask);
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return -1; /* No targets found */
/*
* At this point we have built a mask of cpus representing the
@@ -1146,6 +1249,14 @@ static void set_cpus_allowed_rt(struct task_struct *p,
}
update_rt_migration(rq);
+
+ if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
+ /*
+ * If either the new or old weight is a "1", we need
+ * to requeue to properly move between shared and
+ * exclusive queues.
+ */
+ requeue_task_rt(rq, p);
}
p->cpus_allowed = *new_mask;
@@ -1153,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
}
/* Assumes rq->lock is held */
-static void join_domain_rt(struct rq *rq)
+static void rq_online_rt(struct rq *rq)
{
if (rq->rt.overloaded)
rt_set_overload(rq);
+
+ __enable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
}
/* Assumes rq->lock is held */
-static void leave_domain_rt(struct rq *rq)
+static void rq_offline_rt(struct rq *rq)
{
if (rq->rt.overloaded)
rt_clear_overload(rq);
+
+ __disable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
/*
@@ -1326,8 +1445,8 @@ static const struct sched_class rt_sched_class = {
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
- .join_domain = join_domain_rt,
- .leave_domain = leave_domain_rt,
+ .rq_online = rq_online_rt,
+ .rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,