summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c166
1 files changed, 103 insertions, 63 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..2f0a0be4d344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->numa_group = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = -1;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -3941,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (!sched_feat(UTIL_EST))
return;
- /*
- * Update root cfs_rq's estimated utilization
- *
- * If *p is the last task then the root cfs_rq's estimated utilization
- * of a CPU is 0 by definition.
- */
- ue.enqueued = 0;
- if (cfs_rq->nr_running) {
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
- }
+ /* Update root cfs_rq's estimated utilization */
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -4549,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+ cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4571,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
+ int expires_seq;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4587,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
+ expires_seq = cfs_b->expires_seq;
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@@ -4596,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* spread between our sched_clock and the one on which runtime was
* issued.
*/
- if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ if (cfs_rq->expires_seq != expires_seq) {
+ cfs_rq->expires_seq = expires_seq;
cfs_rq->runtime_expires = expires;
+ }
return cfs_rq->runtime_remaining > 0;
}
@@ -4623,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced. It is valid to compare
- * cfs_b->runtime_expires without any locks since we only care about
- * exact equality, so a partial write will still work.
+ * whether the global deadline(cfs_b->expires_seq) has advanced.
*/
-
- if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+ if (cfs_rq->expires_seq == cfs_b->expires_seq) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -5161,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ u64 overrun;
+
lockdep_assert_held(&cfs_b->lock);
- if (!cfs_b->period_active) {
- cfs_b->period_active = 1;
- hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
- }
+ if (cfs_b->period_active)
+ return;
+
+ cfs_b->period_active = 1;
+ overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+ cfs_b->expires_seq++;
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5345,6 +5385,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
/*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ util_est_enqueue(&rq->cfs, p);
+
+ /*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
@@ -5385,7 +5433,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
add_nr_running(rq, 1);
- util_est_enqueue(&rq->cfs, p);
hrtick_update(rq);
}
@@ -5858,8 +5905,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
- if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
@@ -6102,7 +6149,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
- if (idle_cpu(i)) {
+ if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6191,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
+ /*
+ * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+ * last_update_time.
+ */
+ if (!(sd_flag & SD_BALANCE_FORK))
+ sync_entity_load_avg(&p->se);
+
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
@@ -6224,7 +6278,7 @@ void __update_idle_core(struct rq *rq)
if (cpu == core)
continue;
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
goto unlock;
}
@@ -6256,7 +6310,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6285,7 +6339,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
return cpu;
}
@@ -6348,7 +6402,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return -1;
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
- if (idle_cpu(cpu))
+ if (available_idle_cpu(cpu))
break;
}
@@ -6368,13 +6422,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (idle_cpu(target))
+ if (available_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6436,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- idle_cpu(recent_used_cpu) &&
+ available_idle_cpu(recent_used_cpu) &&
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
/*
* Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6612,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
@@ -6581,7 +6635,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
- affine_sd = tmp;
+ if (cpu != prev_cpu)
+ new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+ sd = NULL; /* Prefer wake_affine over balance flags */
break;
}
@@ -6591,33 +6648,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
- if (affine_sd) {
- sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu == prev_cpu)
- goto pick_cpu;
-
- new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
- }
-
- if (sd && !(sd_flag & SD_BALANCE_FORK)) {
- /*
- * We're going to need the task's util for capacity_spare_wake
- * in find_idlest_group. Sync it up to prev_cpu's
- * last_update_time.
- */
- sync_entity_load_avg(&p->se);
- }
+ if (unlikely(sd)) {
+ /* Slow path */
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ /* Fast path */
- if (!sd) {
-pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- if (want_affine)
- current->recent_used_cpu = cpu;
- }
- } else {
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
}
rcu_read_unlock();
@@ -10174,10 +10214,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
struct cfs_rq *cfs_rq;
int i;
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
- tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
if (!tg->se)
goto err;