Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 189
 1 file changed, 62 insertions(+), 127 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..49ad99378f82 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
/*
* After fork, child runs first. If set to 0 (default) then
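
The two hunks above retune the CFS latency defaults; sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, which is why it drops from 5 to 3 here. A standalone arithmetic sketch (not kernel code; the values mirror the tunables above):

/* Sketch only: checks the ratio described by the comment above. */
#include <assert.h>

int main(void)
{
	unsigned long long latency  = 6000000ULL;  /* new sysctl_sched_latency, ns */
	unsigned long long min_gran = 2000000ULL;  /* new sysctl_sched_min_granularity, ns */

	assert(latency / min_gran == 3);           /* new sched_nr_latency */
	assert(5000000ULL / 1000000ULL == 5);      /* the old defaults gave 5 */
	return 0;
}
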
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
{
unsigned long delta_exec_weighted;
- schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ schedstat_set(curr->statistics.exec_max,
+ max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
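
Most of this patch is mechanical: the per-entity schedstats fields move under se->statistics. The schedstat_set()/schedstat_inc()/schedstat_add() helpers only touch their arguments when CONFIG_SCHEDSTATS is set, so the renamed fields cost nothing in !SCHEDSTATS builds. A paraphrased sketch of those helpers (see kernel/sched_stats.h of this era; treat the exact definitions as an approximation):

/* Paraphrased sketch of the schedstat helpers, not a verbatim copy. */
#ifdef CONFIG_SCHEDSTATS
# define schedstat_set(var, val)       do { var = (val); } while (0)
# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
#else
# define schedstat_set(var, val)       do { } while (0)
# define schedstat_inc(rq, field)      do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
#endif
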
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_max, max(se->wait_max,
- rq_of(cfs_rq)->clock - se->wait_start));
- schedstat_set(se->wait_count, se->wait_count + 1);
- schedstat_set(se->wait_sum, se->wait_sum +
- rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->wait_start);
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
- schedstat_set(se->wait_start, 0);
+ schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (entity_is_task(se))
tsk = task_of(se);
- if (se->sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->sleep_max))
- se->sleep_max = delta;
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
- se->sleep_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
- if (se->block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ if (se->statistics.block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->block_max))
- se->block_max = delta;
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
- se->block_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
- se->iowait_sum += delta;
- se->iowait_count++;
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
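
The sleep/block accounting above measures delta as the rq clock minus the recorded start time and casts it to s64 before the zero check: if the clock reads earlier than the stamp (clock skew, e.g. after a migration), the unsigned subtraction wraps, the signed view goes negative, and the delta is clamped to zero instead of being booked as a huge sleep. A minimal standalone sketch of that pattern:

/* Sketch: clamp a clock delta that may go "negative" due to skew. */
#include <stdint.h>

static uint64_t clock_delta(uint64_t now, uint64_t start)
{
	uint64_t delta = now - start;   /* wraps if start > now */

	if ((int64_t)delta < 0)         /* treat the wrap as skew, not a huge delta */
		delta = 0;
	return delta;
}
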
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
- * Convert the sleeper threshold into virtual time.
- * SCHED_IDLE is a special sub-class. We care about
- * fairness only relative to other SCHED_IDLE tasks,
- * all of which have the same weight.
- */
- if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- task_of(se)->policy != SCHED_IDLE))
- thresh = calc_delta_fair(thresh, se);
-
- /*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
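
With FAIR_SLEEPERS and NORMALIZED_SLEEPER gone, every waking task gets up to one sched_latency of vruntime credit; the lines just below this hunk (not shown) halve that credit under GENTLE_FAIR_SLEEPERS and subtract it. A hedged sketch of the resulting placement, with the GENTLE_FAIR_SLEEPERS part assumed from the surrounding sched_fair.c rather than taken from this hunk:

/* Sketch of the sleeper placement after this change (the halving is
 * assumed to sit just below the hunk, as in sched_fair.c of this era). */
if (!initial) {
	unsigned long thresh = sysctl_sched_latency;

	if (sched_feat(GENTLE_FAIR_SLEEPERS))
		thresh >>= 1;           /* gentler sleeper bonus: halve it */

	vruntime -= thresh;             /* credit the sleeper up to (half) a latency */
}
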
@@ -826,9 +817,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
@@ -912,7 +903,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->slice_max = max(se->slice_max,
+ se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
@@ -1240,7 +1231,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
@@ -1255,18 +1245,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (sync) {
- if (sched_feat(SYNC_LESS) &&
- (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
- } else {
- if (sched_feat(SYNC_MORE) &&
- (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost))
- sync = 1;
- }
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1306,7 +1284,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (sync && balanced)
return 1;
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
@@ -1318,7 +1296,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
@@ -1451,13 +1429,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
- int want_affine = 0;
+ int want_affine = 0, cpu_idle = !current->pid;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
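
The new cpu_idle flag keys off current->pid: the idle (swapper) task is the only task with PID 0, so !current->pid is a cheap test for "this wakeup is issued from the idle task", which the later hunks use to skip the idle-sibling search and take the affine CPU directly. A tiny illustrative helper (hypothetical name, same test):

/* Sketch: PID 0 identifies the idle/swapper task. */
static inline int waking_from_idle(struct task_struct *curr)
{
	return curr->pid == 0;          /* same condition as !current->pid above */
}
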
@@ -1509,13 +1486,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
* If there's an idle sibling in this domain, make that
* the wake_affine target instead of the current cpu.
*/
- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+ if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
target = select_idle_sibling(p, tmp, target);
if (target >= 0) {
if (tmp->flags & SD_WAKE_AFFINE) {
affine_sd = tmp;
want_affine = 0;
+ if (target != cpu)
+ cpu_idle = 1;
}
cpu = target;
}
@@ -1531,6 +1510,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
sd = tmp;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
@@ -1544,9 +1524,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
if (tmp)
update_shares(tmp);
}
+#endif
- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ return cpu;
+ }
while (sd) {
int load_idx = sd->forkexec_idx;
@@ -1591,63 +1574,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}
#endif /* CONFIG_SMP */
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- * degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
- u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- u64 gran = 0;
-
- if (this_run < expected_wakeup)
- gran = expected_wakeup - this_run;
-
- return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
- if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- gran = adaptive_gran(curr, se);
-
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
return gran;
}
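
wakeup_gran() now unconditionally rescales the granularity by the wakee's weight via calc_delta_fair(), i.e. gran * NICE_0_LOAD / se->load.weight: a lighter se yields a larger virtual granularity and a heavier se a smaller one, exactly as the comment describes. A worked example, assuming the 1 ms default wakeup granularity and the usual prio_to_weight values (nice 0 = 1024, nice +5 = 335, nice -5 = 3121; treat these constants as assumptions):

/* Sketch: the effect of calc_delta_fair() on a 1 ms wakeup granularity. */
#include <stdio.h>

int main(void)
{
	unsigned long long gran = 1000000ULL;   /* assumed sysctl_sched_wakeup_granularity, ns */
	unsigned long long nice0 = 1024, nice_p5 = 335, nice_m5 = 3121;

	printf("nice  0 wakee: %llu ns\n", gran * nice0 / nice0);   /* 1000000, unchanged    */
	printf("nice +5 wakee: %llu ns\n", gran * nice0 / nice_p5); /* 3056716, larger gran  */
	printf("nice -5 wakee: %llu ns\n", gran * nice0 / nice_m5); /* 328099, smaller gran  */
	return 0;
}
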
@@ -1705,7 +1651,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int sync = wake_flags & WF_SYNC;
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1683,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
- if (sched_feat(WAKEUP_SYNC) && sync)
- goto preempt;
-
- if (sched_feat(WAKEUP_OVERLAP) &&
- se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost)
- goto preempt;
-
if (!sched_feat(WAKEUP_PREEMPT))
return;
@@ -1844,13 +1781,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
@@ -1866,14 +1803,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
@@ -3112,8 +3049,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
- update_rq_clock(busiest_rq);
- update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {