Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 370
1 file changed, 301 insertions(+), 69 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..75554f366fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -26,6 +27,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -42,18 +44,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constant propagation
+ * at compile time and compiler optimizations based on the feature defaults.
*/
-
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
-
#undef SCHED_FEAT
+#endif
/*
* Number of tasks to iterate in a single balance run.
@@ -83,9 +88,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -505,8 +507,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -526,7 +527,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -535,15 +536,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -733,7 +734,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -747,8 +748,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
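
For context on what set_load_weight() fills in: nice 0 corresponds to a weight of 1024 in sched_prio_to_weight[], each nice level scales the weight by roughly 1.25x, and sched_prio_to_wmult[] caches 2^32 / weight so the hot path can multiply instead of divide. A minimal userspace sketch that approximates those tables (illustration only; the in-kernel tables are hand-rounded constants):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Approximation of the kernel's nice-to-weight tables for illustration:
 * nice 0 maps to weight 1024, each nice step scales by ~1.25, and
 * inv_weight caches 2^32 / weight (cf. sched_prio_to_wmult[]).
 * Build with: cc -O2 weights.c -lm
 */
int main(void)
{
	for (int nice = -20; nice <= 19; nice++) {
		uint32_t weight = (uint32_t)(1024.0 / pow(1.25, nice) + 0.5);
		uint32_t inv_weight = (uint32_t)((1ULL << 32) / weight);

		printf("nice %3d  weight %6u  inv_weight %10u\n",
		       nice, weight, inv_weight);
	}
	return 0;
}
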
@@ -951,8 +960,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -1168,6 +1182,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -1967,8 +1985,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
- smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
if (!(p->state & state))
goto out;
@@ -2069,7 +2087,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: runqueue flags for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2349,7 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -2635,6 +2653,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC ends
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out if a smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
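
The ordering documented in these comments backs the membarrier(2) system call, which promises that every other running thread has passed through a full memory barrier before the call returns; the expedited membarrier variants in particular depend on the barrier after the rq->curr update. A hedged userspace sketch of the simplest such call (raw syscall(2), since glibc has no wrapper; MEMBARRIER_CMD_SHARED is the original system-wide command):

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Issue a system-wide membarrier: every thread currently running in
 * userspace has executed a full memory barrier by the time the call
 * returns.  This is the userspace-visible promise that the kernel-side
 * ordering comments in this diff exist to uphold.
 */
int main(void)
{
	long cmds = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

	if (cmds < 0 || !(cmds & MEMBARRIER_CMD_SHARED)) {
		fprintf(stderr, "membarrier not supported\n");
		return 1;
	}

	if (syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0) < 0) {
		perror("membarrier");
		return 1;
	}

	puts("system-wide memory barrier completed");
	return 0;
}
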
@@ -3281,8 +3309,8 @@ static void __sched notrace __schedule(bool preempt)
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
- smp_mb__before_spinlock();
rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
@@ -3324,6 +3352,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3395,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
@@ -3771,7 +3814,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -3928,7 +3971,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -4808,6 +4851,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5064,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
* Return: On success, 0 and the timeslice is in @interval. Otherwise,
* an error code.
*/
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
struct rq_flags rf;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5094,33 +5136,51 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
+ return retval;
+}
+#endif
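
The userspace contract is unchanged by this split: sched_rr_get_interval(2) still fills a struct timespec with the task's timeslice, and the new compat entry point serves 32-bit callers. A small usage sketch (pid 0 queries the calling thread):

#include <sched.h>
#include <stdio.h>
#include <time.h>

/*
 * Query the timeslice of the calling process (pid 0 == self).  For
 * SCHED_RR tasks this is the round-robin quantum; other scheduling
 * classes may report their own notion of a slice.
 */
int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}

	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
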
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5138,6 +5198,29 @@ void sched_show_task(struct task_struct *p)
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
+
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
void show_state_filter(unsigned long state_filter)
{
@@ -5161,7 +5244,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
@@ -5177,11 +5260,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5438,7 +5516,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
@@ -5538,16 +5616,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
@@ -5683,10 +5760,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -5696,16 +5769,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@@ -5890,7 +5959,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@@ -5909,9 +5978,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
@@ -6578,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6618,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
@@ -6639,7 +6705,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .seq_show = cpu_stats_show,
+ .seq_show = cpu_cfs_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -6657,16 +6723,182 @@ static struct cftype cpu_files[] = {
{ } /* Terminate */
};
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 throttled_usec;
+
+ throttled_usec = cfs_b->throttled_time;
+ do_div(throttled_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "nr_periods %d\n"
+ "nr_throttled %d\n"
+ "throttled_usec %llu\n",
+ cfs_b->nr_periods, cfs_b->nr_throttled,
+ throttled_usec);
+ }
+#endif
+ return 0;
+}
+
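
do_div() divides its 64-bit first argument in place (and evaluates to the remainder), which is why throttled_time is copied into throttled_usec before the conversion above. A plain-C stand-in for the same nanosecond-to-microsecond step (example value only):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/*
 * Userspace stand-in for the kernel pattern above:
 *     throttled_usec = cfs_b->throttled_time;
 *     do_div(throttled_usec, NSEC_PER_USEC);
 * do_div() divides the u64 in place and yields the remainder.
 */
int main(void)
{
	uint64_t throttled_ns = 1234567890ULL;	/* example value, ns */
	uint64_t throttled_usec = throttled_ns;
	uint64_t rem = throttled_usec % NSEC_PER_USEC;

	throttled_usec /= NSEC_PER_USEC;
	printf("%llu ns -> %llu usec (remainder %llu ns)\n",
	       (unsigned long long)throttled_ns,
	       (unsigned long long)throttled_usec,
	       (unsigned long long)rem);
	return 0;
}
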
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+ u64 weight = scale_load_down(tg->shares);
+
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 weight)
+{
+ /*
+ * cgroup weight knobs should use the common MIN, DFL and MAX
+ * values which are 1, 100 and 10000 respectively. While it loses
+ * a bit of range on both ends, it maps pretty well onto the shares
+ * value used by scheduler and the round-trip conversions preserve
+ * the original value over the entire range.
+ */
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ return -ERANGE;
+
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ unsigned long weight = scale_load_down(css_tg(css)->shares);
+ int last_delta = INT_MAX;
+ int prio, delta;
+
+ /* find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+ delta = abs(sched_prio_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ unsigned long weight;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
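
A worked example of the conversion above, assuming CGROUP_WEIGHT_MIN/DFL/MAX of 1/100/10000 and the CFS default of 1024 shares, and ignoring the scale_load() fixed-point shift that cancels out at this interface: cpu.weight 100 maps to 1024 shares, and the round trip preserves the value across the whole range. A standalone sketch of that arithmetic (DIV_ROUND_CLOSEST_ULL open-coded):

#include <stdint.h>
#include <stdio.h>

#define CGROUP_WEIGHT_DFL	100ULL

/*
 * Round-to-nearest division, the userspace equivalent of the
 * DIV_ROUND_CLOSEST_ULL() used by cpu_weight_read_u64()/write_u64().
 */
static uint64_t div_closest(uint64_t x, uint64_t d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	uint64_t weights[] = { 1, 50, 100, 200, 10000 };

	for (unsigned int i = 0; i < sizeof(weights) / sizeof(weights[0]); i++) {
		uint64_t w = weights[i];
		/* cpu.weight -> CFS shares (write path) */
		uint64_t shares = div_closest(w * 1024, CGROUP_WEIGHT_DFL);
		/* CFS shares -> cpu.weight (read path) */
		uint64_t back = div_closest(shares * CGROUP_WEIGHT_DFL, 1024);

		printf("cpu.weight %5llu -> shares %6llu -> cpu.weight %5llu\n",
		       (unsigned long long)w, (unsigned long long)shares,
		       (unsigned long long)back);
	}
	return 0;
}
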
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+{
+ if (quota < 0)
+ seq_puts(sf, "max");
+ else
+ seq_printf(sf, "%ld", quota);
+
+ seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+ u64 *periodp, u64 *quotap)
+{
+ char tok[21]; /* U64_MAX */
+
+ if (!sscanf(buf, "%s %llu", tok, periodp))
+ return -EINVAL;
+
+ *periodp *= NSEC_PER_USEC;
+
+ if (sscanf(tok, "%llu", quotap))
+ *quotap *= NSEC_PER_USEC;
+ else if (!strcmp(tok, "max"))
+ *quotap = RUNTIME_INF;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct task_group *tg = css_tg(of_css(of));
+ u64 period = tg_get_cfs_period(tg);
+ u64 quota;
+ int ret;
+
+ ret = cpu_period_quota_parse(buf, &period, &quota);
+ if (!ret)
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
+ return ret ?: nbytes;
+}
+#endif
+
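
Taken together, these handlers give cgroup v2 a cpu.max file that reads and writes as "$MAX $PERIOD" in microseconds, where "max" means no bandwidth limit and an omitted period keeps its current value. A hedged usage sketch (the mount point and the "demo" group are assumptions for illustration):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Limit a pre-existing (hypothetical) cgroup to 50ms of CPU every 100ms,
 * then read the setting back.  Path and group name are illustrative only.
 */
int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/cpu.max";
	const char *limit = "50000 100000";	/* quota period, in usec */
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open cpu.max");
		return 1;
	}

	if (write(fd, limit, strlen(limit)) < 0) {
		perror("write cpu.max");
		return 1;
	}

	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("cpu.max is now: %s", buf);	/* e.g. "50000 100000\n" */
	}

	close(fd);
	return 0;
}
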
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_weight_read_u64,
+ .write_u64 = cpu_weight_write_u64,
+ },
+ {
+ .name = "weight.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_weight_nice_read_s64,
+ .write_s64 = cpu_weight_nice_write_s64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_max_show,
+ .write = cpu_max_write,
+ },
+#endif
+ { } /* terminate */
+};
+
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .legacy_cftypes = cpu_files,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
.early_init = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */