From a43455a1d572daf7b730fe12eb747d1e17411365 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Jun 2014 16:09:42 -0400 Subject: sched/numa: Ensure task_numa_migrate() checks the preferred node The first thing task_numa_migrate() does is check to see if there is CPU capacity available on the preferred node, in order to move the task there. However, if the preferred node is all busy, we would skip considering that node for task swaps in the subsequent loop. This prevents NUMA convergence of tasks on busy systems. However, swapping locations with a task on our preferred nid, when the preferred nid is busy, is perfectly fine. The fix is to also look for a CPU on our preferred nid when it is totally busy. This changes "perf bench numa mem -p 4 -t 20 -m -0 -P 1000" from not converging in 15 minutes on my 4 node system, to converging in 10-20 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604160942.6969b101@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d3335e1f..8fbb0116bb5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1302,9 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ if (env.best_cpu == -1) { -- cgit v1.2.3 From bb97fc31647539f1f102eed646a95e200160a150 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 4 Jun 2014 16:33:15 -0400 Subject: sched/numa: Always try to migrate to preferred node at task_numa_placement() time It is possible that at task_numa_placement() time the task's numa_preferred_nid does not change, but the task is not actually running on the preferred node. In that case, we still want to attempt migration to the preferred node.
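As a minimal sketch of the new behavior, assuming a toy userspace model in which the struct, fields, and printf() stand in for the scheduler's own sched_setnuma() and numa_migrate_preferred() machinery, the migration attempt now keys off the task's current node rather than off a change in the preferred nid:

#include <stdio.h>

/* Toy model of task_numa_placement() after this patch; illustrative
 * only, not the kernel code. */
struct task {
	int numa_preferred_nid;
	int cur_nid;		/* node the task currently runs on */
};

static void numa_placement(struct task *p, int max_nid, long max_faults)
{
	if (!max_faults)
		return;

	/* Set the new preferred node if it changed... */
	if (max_nid != p->numa_preferred_nid)
		p->numa_preferred_nid = max_nid;	/* sched_setnuma() */

	/* ...but attempt the migration whenever the task is not already
	 * running there, even when the preferred nid did not change. */
	if (p->cur_nid != p->numa_preferred_nid)
		printf("migrate task to node %d\n", p->numa_preferred_nid);
}

int main(void)
{
	/* Preferred nid stays 1 while the task sits on node 0: the old
	 * code skipped the migration attempt, this version retries it. */
	struct task p = { .numa_preferred_nid = 1, .cur_nid = 0 };

	numa_placement(&p, 1, 100);
	return 0;
}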
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140604163315.1dbc7b56@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fbb0116bb5a..3fa3e1839c86 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1613,11 +1613,13 @@ static void task_numa_placement(struct task_struct *p) spin_unlock_irq(group_lock); } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } -- cgit v1.2.3 From 5d5e2b1bcbdc996e72815c03fdc5ea82c4642397 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jun 2014 10:58:43 +0200 Subject: sched: Fix CACHE_HOT_BUDDY condition When computing cache hotness, we should check whether the migration dst cpu is idle, instead of the current cpu. Though the two are the same in normal balancing, that no longer holds in nohz idle balancing, at least. Signed-off-by: Hillf Danton Signed-off-by: Peter Zijlstra Cc: Hillf Danton Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140607090452.4696E301D2@webmail.sinamail.sina.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3fa3e1839c86..1f9c4571615d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5095,8 +5095,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) /* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; @@ -5109,7 +5108,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5119,7 +5118,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; } @@ -5273,7 +5272,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); -- cgit v1.2.3 From c06f04c70489b9deea3212af8375e2f0c2f0b184 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 20 Jun 2014 15:21:20 -0700 Subject: sched: Fix potential near-infinite distribute_cfs_runtime() loop distribute_cfs_runtime() intentionally only hands out enough runtime to bring each cfs_rq to 1 ns of runtime, expecting the cfs_rqs to then take the runtime they need only once they actually get to run.
However, if they get to run sufficiently quickly, the period timer is still in distribute_cfs_runtime() and no runtime is available, causing them to throttle. Then distribute has to handle them again, and this can go on until distribute has handed out all of the runtime 1ns at a time, which takes far too long. Instead allow access to the same runtime that distribute is handing out, accepting that corner cases with very low quota may be able to spend the entire cfs_b->runtime during distribute_cfs_runtime, meaning that the runtime directly handed out by distribute_cfs_runtime was over quota. In addition, if a cfs_rq does manage to throttle like this, make sure the existing distribute_cfs_runtime no longer loops over it again. Signed-off-by: Ben Segall Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140620222120.13814.21652.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1f9c4571615d..ef5eac773c70 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3361,7 +3361,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3418,7 +3422,8 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3449,7 +3454,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3495,22 +3500,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. 
*/ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3518,10 +3518,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3632,10 +3632,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3646,7 +3645,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } -- cgit v1.2.3 From 4486edd12b5ac8a9af7a5e16e4b9eeb3b8339c10 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Mon, 23 Jun 2014 12:16:49 -0700 Subject: sched/fair: Implement fast idling of CPUs when the system is partially loaded When a system is lightly loaded (i.e. no more than 1 job per cpu), attempting to pull a job to a cpu before putting it to idle is unnecessary and can be skipped. This patch adds an indicator so the scheduler can know when there's no more than 1 active job on any CPU in the system, and skip needless job pulls. On a 4 socket machine with a request/response kind of workload from clients, we saw about a 0.13 msec delay when we go through a full load balance to try to pull a job from all the other cpus. While 0.1 msec was spent on processing the request and generating a response, the 0.13 msec load balance overhead was more than the actual work being done. This overhead can be skipped much of the time for lightly loaded systems. With this patch, we tested with a netperf request/response workload that has the server busy with half the cpus in a 4 socket system. We found the patch eliminated 75% of the load balance attempts before idling a cpu. The overhead of setting/clearing the indicator is low, as we already gather the necessary info while we call add_nr_running() and update_sd_lb_stats(). We switch to full load balancing immediately if any cpu gets more than one job on its run queue in add_nr_running(). We clear the indicator, to avoid load balancing, when we detect that no cpus have more than one job while we scan the run queues in update_sg_lb_stats(). We are aggressive in turning on load balancing and opportunistic in skipping it.
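A self-contained sketch of the indicator's life cycle, assuming simplified userspace stand-ins for struct rq and struct root_domain (illustrative only, not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

struct root_domain { bool overload; };
struct rq { unsigned int nr_running; struct root_domain *rd; };

/* Aggressive: set the flag as soon as any rq picks up a second task. */
static void add_nr_running(struct rq *rq, unsigned int count)
{
	unsigned int prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;
	if (prev_nr < 2 && rq->nr_running >= 2 && !rq->rd->overload)
		rq->rd->overload = true;
}

/* Opportunistic: cleared only by the root-domain statistics scan,
 * once no rq is seen with more than one runnable task. */
static void update_sd_lb_stats(struct rq *rqs, int nr, struct root_domain *rd)
{
	bool overload = false;
	int i;

	for (i = 0; i < nr; i++)
		if (rqs[i].nr_running > 1)
			overload = true;
	rd->overload = overload;
}

/* idle_balance() can skip the pull when nothing is overloaded. */
static bool skip_idle_balance(const struct rq *rq)
{
	return !rq->rd->overload;
}

int main(void)
{
	struct root_domain rd = { false };
	struct rq rqs[2] = { { 1, &rd }, { 1, &rd } };

	printf("skip? %d\n", skip_idle_balance(&rqs[0]));	/* 1 */
	add_nr_running(&rqs[1], 1);	/* a second task arrives */
	printf("skip? %d\n", skip_idle_balance(&rqs[0]));	/* 0 */
	rqs[1].nr_running = 1;		/* the extra task finishes */
	update_sd_lb_stats(rqs, 2, &rd);
	printf("skip? %d\n", skip_idle_balance(&rqs[0]));	/* 1 */
	return 0;
}

The asymmetry is deliberate: the flag is set eagerly from the fast path so a newly overloaded CPU is never missed, but cleared only from the periodic root-domain scan.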
Signed-off-by: Tim Chen Acked-by: Rik van Riel Acked-by: Jason Low Cc: "Paul E.McKenney" Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Alex Shi Cc: Michel Lespinasse Cc: Peter Hurley Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403551009.2970.613.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 ++++++++++++++++++--- kernel/sched/sched.h | 12 ++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef5eac773c70..e3ff3d1c4780 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5866,7 +5866,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5884,6 +5885,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5994,6 +5999,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6014,7 +6020,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; @@ -6048,6 +6055,13 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6766,7 +6780,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eb8567610295..0191ed563bdd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,6 +477,9 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* Indicate more than one runnable task for any CPU */ + bool overload; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -1218,8 +1221,13 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(rq->cpu)) { /* * Tick is needed if more than one task runs on a CPU. 
@@ -1231,8 +1239,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) */ tick_nohz_full_kick_cpu(rq->cpu); } - } #endif + } } static inline void sub_nr_running(struct rq *rq, unsigned count) -- cgit v1.2.3 From f0b8a4afd6a8c500161e45065a91738b490bf5ae Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:29 -0400 Subject: sched/numa: Use group's max nid as task's preferred nid From task_numa_placement, always try to consolidate the tasks in a group on the group's top nid. In case this task is part of a group that is interleaved over multiple nodes, task_numa_migrate will set the task's preferred nid to the best node it could find for the task, so this patch will cause at most one run through task_numa_migrate. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e3ff3d1c4780..96b2d3929a4e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1594,23 +1594,8 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = max_group_nid; } if (max_faults) { -- cgit v1.2.3 From 28a21745190a0ca613cab817bfe3dc65373158bf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:13 -0400 Subject: sched/numa: Move power adjustment into load_too_imbalanced() Currently the NUMA code scales the load on each node with the amount of CPU power available on that node, but it does not apply any adjustment to the load of the task that is being moved over. On systems with SMT/HT, this results in a task being weighed much more heavily than a CPU core, and a task move that would even out the load between nodes being disallowed. The correct thing is to apply the power correction to the numbers after we have first applied the move of the tasks' loads to them. This also allows us to do the power correction with a multiplication, rather than a division. Also drop two function arguments for load_too_imbalanced(), since it takes various factors from env already.
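A minimal userspace model of the resulting comparison, with made-up load and capacity numbers; it keeps only the threshold check and omits the real load_too_imbalanced()'s fallback comparison against the pre-move imbalance:

#include <stdio.h>

static int load_too_imbalanced(long src_load, long dst_load,
			       long src_capacity, long dst_capacity,
			       long imbalance_pct)
{
	long imb;

	/* We care about the slope of the imbalance, not the direction. */
	if (dst_load < src_load) {
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}

	/*
	 * Compare src_load/src_capacity against dst_load/dst_capacity by
	 * cross-multiplying, which needs no division:
	 */
	imb = dst_load * src_capacity * 100 -
	      src_load * dst_capacity * imbalance_pct;
	return imb > 0;
}

int main(void)
{
	/* Equal capacities: a 40% spread exceeds imbalance_pct = 125. */
	printf("%d\n", load_too_imbalanced(1000, 1400, 1024, 1024, 125));
	/* The busier node has twice the capacity (e.g. its SMT siblings
	 * are counted in), so the same spread is now acceptable. */
	printf("%d\n", load_too_imbalanced(1000, 1400, 1024, 2048, 125));
	return 0;
}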
Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96b2d3929a4e..f287d0b4007a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); @@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { long imb, old_imb; + long orig_src_load, orig_dst_load; + long src_capacity, dst_capacity; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ if (dst_load < src_load) swap(dst_load, src_load); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = dst_load * src_capacity * 100 - + src_load * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; @@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, * The imbalance is above the allowed threshold. * Compare it with the old imbalance. */ + orig_src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + if (orig_dst_load < orig_src_load) swap(orig_dst_load, orig_src_load); - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + old_imb = orig_dst_load * src_capacity * 100 - + orig_src_load * dst_capacity * env->imbalance_pct; /* Would this change make things worse? */ return (imb > old_imb); @@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. 
*/ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; if (cur) { load = task_h_load(cur); @@ -1225,8 +1235,7 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; assign: -- cgit v1.2.3 From 6dc1a672ab15604947361dcd02e459effa09bad5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:14 -0400 Subject: sched/numa: Use effective_load() to balance NUMA loads When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places on a CPU is determined by the group the task is in. The active groups on the source and destination CPU can be different, resulting in a different load contribution by the same task at its source and at its destination. As a result, the load needs to be calculated separately for each CPU, instead of estimated once with task_h_load(). Getting this calculation right allows some workloads to converge, where previously the last thread could get stuck on another node, without being able to migrate to its final destination. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f287d0b4007a..d6526d2cf173 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; + struct task_group *tg; long src_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; + src_load = env->src_stats.load; + dst_load = env->dst_stats.load; + + /* Calculate the effect of moving env->p from src to dst. */ + load = env->p->se.load.weight; + tg = task_group(env->p); + src_load += effective_load(tg, env->src_cpu, -load, -load); + dst_load += effective_load(tg, env->dst_cpu, load, load); if (cur) { - load = task_h_load(cur); - dst_load -= load; - src_load += load; + /* Cur moves in the opposite direction. */ + load = cur->se.load.weight; + tg = task_group(cur); + src_load += effective_load(tg, env->src_cpu, load, load); + dst_load += effective_load(tg, env->dst_cpu, -load, -load); } if (load_too_imbalanced(src_load, dst_load, env)) -- cgit v1.2.3 From 1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:15 -0400 Subject: sched/numa: Simplify task_numa_compare() When a task is part of a numa_group, the comparison should always use the group weight, in order to make workloads converge. 
Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538378-31571-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d6526d2cf173..cebb312e874b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env, struct task_group *tg; long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) imp += group_weight(cur, env->src_nid) - group_weight(cur, env->dst_nid); -- cgit v1.2.3 From 0132c3e1777ceabc24c7d209b7cbe78c28c03c09 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:46:16 -0400 Subject: sched/numa: Examine a task move when examining a task swap Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4 node system results in 160 runnable threads on a system with 80 CPU threads. Once a process has nearly converged, with 39 threads on one node and 1 thread on another node, the remaining thread will be unable to migrate to its preferred node through a task swap. However, a simple task move would make the workload converge, without causing an imbalance. Test for this unlikely occurrence, and attempt a task move to the preferred nid when it happens. # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000" ### # 160 tasks will execute (on 4 nodes, 80 CPUs): # -1x 0MB global shared mem operations # -1x 1000MB process shared mem operations # -1x 0MB thread local mem operations ### ### # # 0.0% [0.2 mins] 0/0 1/1 36/2 0/0 [36/3 ] l: 0-0 ( 0) {0-2} # 0.0% [0.3 mins] 43/3 37/2 39/2 41/3 [ 6/10] l: 0-1 ( 1) {1-2} # 0.0% [0.4 mins] 42/3 38/2 40/2 40/2 [ 4/9 ] l: 1-2 ( 1) [50.0%] {1-2} # 0.0% [0.6 mins] 41/3 39/2 40/2 40/2 [ 2/9 ] l: 2-4 ( 2) [50.0%] {1-2} # 0.0% [0.7 mins] 40/2 40/2 40/2 40/2 [ 0/8 ] l: 3-5 ( 2) [40.0%] ( 41.8s converged) Without this patch, this same perf bench numa mem run had to rely on the scheduler load balancer to first balance out the load (moving a random task), before a task swap could complete the NUMA convergence. The load balancer does not normally take action unless the load difference exceeds 25%. Convergence times of over half an hour have been observed without this patch. With this patch, the NUMA balancing code will simply migrate the task, if that does not cause an imbalance. Also skip examining a CPU in detail if the improvement on that CPU is no more than the best we already have.
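A toy model of the added decision, with the fault-weight scoring reduced to plain integers (illustrative; the real code derives imp and moveimp from NUMA fault statistics and validates the move with load_too_imbalanced()):

#include <stdio.h>

/* imp scores swapping with the task on the candidate CPU; moveimp
 * scores simply moving there without displacing anyone. */
static const char *examine_cpu(long moveimp, long imp, long best_imp,
			       int move_keeps_balance)
{
	/* Skip the detailed checks if neither a swap nor a plain move
	 * can beat the best candidate found so far. */
	if (imp <= best_imp && moveimp <= best_imp)
		return "skip";

	/* A plain move that beats the swap and keeps the load balanced
	 * wins; the kernel stores moveimp - 1 so that an actually idle
	 * CPU would still be preferred over this candidate. */
	if (moveimp > imp && moveimp > best_imp && move_keeps_balance)
		return "move";

	if (imp <= best_imp)
		return "skip";

	return "swap";
}

int main(void)
{
	/* Swapping would hurt the other task (imp < moveimp), but a
	 * plain move leaves the nodes balanced: the move wins. */
	printf("%s\n", examine_cpu(40, 10, 0, 1));	/* move */
	/* Same scores, but the move would unbalance the nodes. */
	printf("%s\n", examine_cpu(40, 10, 0, 0));	/* swap */
	return 0;
}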
Signed-off-by: Rik van Riel Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ggthh0rnh0yua6o5o3p6cr1o@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cebb312e874b..9d1734a724a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env, long src_load, dst_load; long load; long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; rcu_read_lock(); cur = ACCESS_ONCE(dst_rq->curr); @@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env, } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { @@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* @@ -1230,6 +1232,23 @@ balance: src_load += effective_load(tg, env->src_cpu, -load, -load); dst_load += effective_load(tg, env->dst_cpu, load, load); + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; + if (cur) { /* Cur moves in the opposite direction. */ load = cur->se.load.weight; -- cgit v1.2.3 From db015daedb56251b73f956f70b3b8813f80d8ee1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:34 -0400 Subject: sched/numa: Rework best node setting in task_numa_migrate() Fix up the best node setting in task_numa_migrate() to deal with a task in a pseudo-interleaved NUMA group, which is already running in the best location. Set the task's preferred nid to the current nid, so task migration is not retried at a high rate. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-7-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d1734a724a8..7bb2f464b456 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1354,10 +1354,6 @@ static int task_numa_migrate(struct task_struct *p) } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1366,8 +1362,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. 
*/ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an -- cgit v1.2.3 From a22b4b012340b988dbe7a58461d6fcc582f34aa0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 23 Jun 2014 11:41:35 -0400 Subject: sched/numa: Change scan period code to match intent Reading through the scan period code and comment, it appears the intent was to slow down NUMA scanning when a majority of accesses are on the local node, specifically a local:remote ratio of 3:1. However, the code actually tests local / (local + remote), and the actual cut-off point was around 30% local accesses, well before a task has actually converged on a node. Changing the threshold to 7 means scanning slows down when a task has around 70% of its accesses local, which appears to match the intent of the code more closely. Signed-off-by: Rik van Riel Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403538095-31256-8-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7bb2f464b456..a140c6a8c947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1452,12 +1452,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of -- cgit v1.2.3 From 0e59bdaea75f12a7d7c03672f4ac22c0119a1bc0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 25 Jun 2014 12:19:42 +0400 Subject: sched/fair: Disable runtime_enabled on dying rq We kill rq->rd on the CPU_DOWN_PREPARE stage: cpuset_cpu_inactive -> cpuset_update_active_cpus -> partition_sched_domains -> -> cpu_attach_domain -> rq_attach_root -> set_rq_offline This unthrottles all throttled cfs_rqs. But the cpu is still able to call schedule() till take_cpu_down->__cpu_disable() is called from stop_machine. In this case the tasks from the just-unthrottled cfs_rqs are pickable in the standard scheduler way, and they are picked by the dying cpu. The cfs_rqs become throttled again, and migrate_tasks() in migration_call skips their tasks (one more unthrottle in migrate_tasks()->CPU_DYING does not happen, because rq->rd is already NULL). The patch sets runtime_enabled to zero.
This guarantees that the runtime is not accounted and that the cfs_rqs won't exceed the given cfs_rq->runtime_remaining = 1, so tasks will be pickable in migrate_tasks(). runtime_enabled is recalculated when the rq becomes online again. Ben Segall also noticed that we always enable runtime in tg_set_cfs_bandwidth(). Actually, we should do that for online cpus only. To prevent races with unthrottle_offline_cfs_rqs() we take the get_online_cpus() lock. Reviewed-by: Ben Segall Reviewed-by: Srikar Dronamraju Signed-off-by: Kirill Tkhai CC: Konstantin Khorenko CC: Paul Turner CC: Mike Galbraith Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1403684382.3462.42.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +++++++- kernel/sched/fair.c | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e50234ba0b27..2dbc63d1a847 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7817,6 +7817,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Prevent race between setting of cfs_rq->runtime_enabled and + * unthrottle_offline_cfs_rqs(). + */ + get_online_cpus(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7842,7 +7847,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) } raw_spin_unlock_irq(&cfs_b->lock); - for_each_possible_cpu(i) { + for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; @@ -7858,6 +7863,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); + put_online_cpus(); return ret; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a140c6a8c947..923fe32db6b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3798,6 +3798,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} + static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -3811,6 +3824,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * there's some valid quota amount */ cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here.
+ */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3854,6 +3873,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -7362,6 +7382,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) -- cgit v1.2.3 From 8875125efe8402c4d84b08291e68f1281baba8e2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 29 Jun 2014 00:03:57 +0400 Subject: sched: Transform resched_task() into resched_curr() We always use resched_task() with the rq->curr argument; it's not possible to reschedule any task but the rq's current one. The patch introduces resched_curr(struct rq *) to replace all of the repeating patterns. The main aim is cleanup, but there is a small size benefit too: (before) $ size kernel/sched/built-in.o text data bss dec hex filename 155274 16445 7042 178761 2ba49 kernel/sched/built-in.o $ size vmlinux text data bss dec hex filename 7411490 1178376 991232 9581098 92322a vmlinux (after) $ size kernel/sched/built-in.o text data bss dec hex filename 155130 16445 7042 178617 2b9b9 kernel/sched/built-in.o $ size vmlinux text data bss dec hex filename 7411362 1178376 991232 9580970 9231aa vmlinux I was choosing between resched_curr() and resched_rq(), and the first name looks better to me. One small lie in Documentation/trace/ftrace.txt: I have not actually collected the traces again, in the hope that the patch won't make execution times much worse :) Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140628200219.1778.18735.stgit@localhost Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 2 +- include/linux/sched.h | 6 +++--- kernel/sched/core.c | 25 +++++++++++++------------ kernel/sched/deadline.c | 16 ++++++++-------- kernel/sched/fair.c | 20 ++++++++++---------- kernel/sched/idle_task.c | 2 +- kernel/sched/rt.c | 27 ++++++++++++++------------- kernel/sched/sched.h | 2 +- 8 files changed, 51 insertions(+), 49 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 2479b2a0c77c..4da42616939f 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1515,7 +1515,7 @@ Doing the same with chrt -r 5 and function-trace set.
-0 3d.h4 1us+: 0:120:R + [003] 2448: 94:R sleep -0 3d.h4 2us : ttwu_do_activate.constprop.87 <-try_to_wake_up -0 3d.h3 3us : check_preempt_curr <-ttwu_do_wakeup - -0 3d.h3 3us : resched_task <-check_preempt_curr + -0 3d.h3 3us : resched_curr <-check_preempt_curr -0 3dNh3 4us : task_woken_rt <-ttwu_do_wakeup -0 3dNh3 4us : _raw_spin_unlock <-try_to_wake_up -0 3dNh3 4us : sub_preempt_count <-_raw_spin_unlock diff --git a/include/linux/sched.h b/include/linux/sched.h index c9c9ff723525..41a195385081 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2786,7 +2786,7 @@ static inline bool __must_check current_set_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic(); @@ -2804,7 +2804,7 @@ static inline bool __must_check current_clr_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ smp_mb__after_atomic(); @@ -2836,7 +2836,7 @@ static inline void current_clr_polling(void) * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also * fold. */ - smp_mb(); /* paired with resched_task() */ + smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf7695a6c1d2..2f960813c582 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -589,30 +589,31 @@ static bool set_nr_if_polling(struct task_struct *p) #endif /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq) { + struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&rq->lock); - if (test_tsk_need_resched(p)) + if (test_tsk_need_resched(curr)) return; - cpu = task_cpu(p); + cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(p); + set_tsk_need_resched(curr); set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(p)) + if (set_nr_and_not_polling(curr)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -625,7 +626,7 @@ void resched_cpu(int cpu) if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; - resched_task(cpu_curr(cpu)); + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1027,7 +1028,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (class == rq->curr->sched_class) break; if (class == p->sched_class) { - resched_task(rq->curr); + resched_curr(rq); break; } } @@ -3073,7 +3074,7 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_curr(rq); } out_unlock: task_rq_unlock(rq, p, &flags); @@ -4299,7 +4300,7 @@ again: * fairness. 
*/ if (preempt && rq != p_rq) - resched_task(p_rq->curr); + resched_curr(p_rq); } out_unlock: @@ -7106,7 +7107,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); - resched_task(rq->curr); + resched_curr(rq); } check_class_changed(rq, p, prev_class, old_prio); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1258f..df0b77a8caca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -535,7 +535,7 @@ again: if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else - resched_task(rq->curr); + resched_curr(rq); #ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, @@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) - resched_task(curr); + resched_curr(rq); } /* @@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; - resched_task(rq->curr); + resched_curr(rq); } static int pull_dl_task(struct rq *this_rq); @@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1333,7 +1333,7 @@ retry: if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1373,7 +1373,7 @@ retry: set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); - resched_task(later_rq->curr); + resched_curr(later_rq); double_unlock_balance(rq, later_rq); @@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* * Again, we don't know if p has a earlier * or later deadline, so let's blindly set a * (maybe not needed) rescheduling point. */ - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else switched_to_dl(rq, p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 923fe32db6b3..f5f0cc91518c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2923,7 +2923,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2947,7 +2947,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -3087,7 +3087,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. 
*/ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3278,7 +3278,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3438,7 +3438,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -3897,7 +3897,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } @@ -4766,7 +4766,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -7457,7 +7457,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7482,7 +7482,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7545,7 +7545,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..67ad4e7f506a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) */ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->idle); + resched_curr(rq); } static struct task_struct * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 671a8b5fdb6f..5f6edca4fafd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; @@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + resched_curr(rq); } } @@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) return; enqueue_top_rt_rq(rt_rq); - resched_task(rq->curr); + resched_curr(rq); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -951,7 +952,7 @@ static void update_curr_rt(struct rq *rq) raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -1366,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * to try and 
push current away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } #endif /* CONFIG_SMP */ @@ -1377,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1693,7 +1694,7 @@ retry: * just reschedule current. */ if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1740,7 +1741,7 @@ retry: activate_task(lowest_rq, next_task, 0); ret = 1; - resched_task(lowest_rq->curr); + resched_curr(lowest_rq); double_unlock_balance(rq, lowest_rq); @@ -1939,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) return; if (pull_rt_task(rq)) - resched_task(rq->curr); + resched_curr(rq); } void __init init_sched_rt_class(void) @@ -1977,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -2006,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * Only reschedule if p is still on the same runqueue. */ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* For UP simply resched on drop of prio */ if (oldprio < p->prio) - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else { /* @@ -2019,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * then reschedule. */ if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0191ed563bdd..1283945d1ace 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1199,7 +1199,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; -- cgit v1.2.3 From e720fff6341fe4b95e5a93c939bd3c77fa55ced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jul 2014 16:01:53 +0200 Subject: sched/numa: Revert "Use effective_load() to balance NUMA loads" Due to divergent trees, Rik found that this patch is no longer required. Requested-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-u6odkgkw8wz3m7orgsjfo5pi@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f5f0cc91518c..45943b2fa82b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1151,7 +1151,6 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - struct task_group *tg; long src_load, dst_load; long load; long imp = env->p->numa_group ? groupimp : taskimp; @@ -1223,14 +1222,9 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - src_load = env->src_stats.load; - dst_load = env->dst_stats.load; - - /* Calculate the effect of moving env->p from src to dst. */ - load = env->p->se.load.weight; - tg = task_group(env->p); - src_load += effective_load(tg, env->src_cpu, -load, -load); - dst_load += effective_load(tg, env->dst_cpu, load, load); + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; if (moveimp > imp && moveimp > env->best_imp) { /* @@ -1250,11 +1244,9 @@ balance: goto unlock; if (cur) { - /* Cur moves in the opposite direction. */ - load = cur->se.load.weight; - tg = task_group(cur); - src_load += effective_load(tg, env->src_cpu, load, load); - dst_load += effective_load(tg, env->dst_cpu, -load, -load); + load = task_h_load(cur); + dst_load -= load; + src_load += load; } if (load_too_imbalanced(src_load, dst_load, env)) -- cgit v1.2.3 From cd3bd4e628a6d57d66afe77835fe8d93ae3e41f8 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 28 Jul 2014 12:38:06 +0900 Subject: sched/fair: Fix 'make xmldocs' warning caused by missing description This patch fixes the following warning, caused by the missing description of the "overload" parameter in kernel/sched/fair.c: Warning(.//kernel/sched/fair.c:5906): No description found for parameter 'overload' Signed-off-by: Masanari Iida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1406518686-7274-1-git-send-email-standby24x7@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45943b2fa82b..bfa3c86d0d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5898,6 +5898,7 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, -- cgit v1.2.3
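For reference, kernel-doc requires one @name line per documented parameter; a sketch of the repaired comment block, abbreviated from the function's actual header (the one-line summary is assumed here, and the @group and @env descriptions are shortened):

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @sgs: variable to hold the statistics for this group.
 * @overload: Indicate more than one runnable task for any CPU.
 */

Any parameter left without such a line triggers the 'No description found' warning from 'make xmldocs'.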