From edde96eafc91a510f404e7b82cfc0ecb608505ee Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 4 Aug 2012 11:49:47 +0300 Subject: sched: Document schedule() entry points This patch adds a comment on top of the schedule() function to explain to scheduler newbies how the main scheduler function is entered. Acked-by: Randy Dunlap Explained-by: Ingo Molnar Explained-by: Peter Zijlstra Signed-off-by: Pekka Enberg Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344070187-2420-1-git-send-email-penberg@kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..c9a3655e572d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3367,6 +3367,40 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space */ static void __sched __schedule(void) { -- cgit v1.2.3 From f03542a7019c600163ac4441d8a826c92c1bd510 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 26 Jul 2012 08:55:34 +0800 Subject: sched: recover SD_WAKE_AFFINE in select_task_rq_fair and code clean up Since power saving code was removed from sched now, the implement code is out of service in this function, and even pollute other logical. like, 'want_sd' never has chance to be set '0', that remove the effect of SD_WAKE_AFFINE here. So, clean up the obsolete code, includes SD_PREFER_LOCAL. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5028F431.6000306@intel.com Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 - include/linux/topology.h | 2 -- kernel/sched/core.c | 1 - kernel/sched/fair.c | 34 +++------------------------------- 4 files changed, 3 insertions(+), 35 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c86648a2f9..f3eebc121ebc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -860,7 +860,6 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ diff --git a/include/linux/topology.h b/include/linux/topology.h index fec12d667211..d3cf0d6e7712 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -129,7 +129,6 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ @@ -160,7 +159,6 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c9a3655e572d..4376c9f34790 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6622,7 +6622,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 0*SD_WAKE_AFFINE - | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 287bfaca6420..01d3eda6b7f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2686,7 +2686,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int want_sd = 1; int sync = wake_flags & WF_SYNC; if (p->nr_cpus_allowed == 1) @@ -2703,27 +2702,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (!(tmp->flags & SD_LOAD_BALANCE)) continue; - /* - * If power savings logic is enabled for a domain, see if we - * are not overloaded, if so, don't balance wider. - */ - if (tmp->flags & (SD_PREFER_LOCAL)) { - unsigned long power = 0; - unsigned long nr_running = 0; - unsigned long capacity; - int i; - - for_each_cpu(i, sched_domain_span(tmp)) { - power += power_of(i); - nr_running += cpu_rq(i)->cfs.nr_running; - } - - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - - if (nr_running < capacity) - want_sd = 0; - } - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -2731,21 +2709,15 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; - want_affine = 0; - } - - if (!want_sd && !want_affine) break; + } - if (!(tmp->flags & sd_flag)) - continue; - - if (want_sd) + if (tmp->flags & sd_flag) sd = tmp; } if (affine_sd) { - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) prev_cpu = cpu; new_cpu = select_idle_sibling(p, prev_cpu); -- cgit v1.2.3 From 73fbec604432e1fbfeb1bc59a110dac1f98160f6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Jun 2012 15:57:37 +0200 Subject: sched: Move cputime code to its own file Extract cputime code from the giant sched/core.c and put it in its own file. This make it easier to deal with this particular area and de-bloat a bit more core.c Signed-off-by: Frederic Weisbecker Acked-by: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 557 +------------------------------------------------ kernel/sched/cputime.c | 504 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 63 ++++++ 4 files changed, 570 insertions(+), 556 deletions(-) create mode 100644 kernel/sched/cputime.c (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 173ea52f3af0..f06d249e103b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4376c9f34790..ae3bcaa3afbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. - */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void account_system_vtime(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(account_system_vtime); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif } -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static int irqtime_account_hi_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -static int irqtime_account_si_update(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - u64 latest_ns; - int ret = 0; - - local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; - local_irq_restore(flags); - return ret; -} - -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2809,404 +2652,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CGROUP_CPUACCT -struct cgroup_subsys cpuacct_subsys; -struct cpuacct root_cpuacct; -#endif - -static inline void task_group_account_field(struct task_struct *p, int index, - u64 tmp) -{ -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif - /* - * Since all updates are sure to touch the root cgroup, we - * get ourselves ahead and touch it first. If the root cgroup - * is the only cgroup, then nothing else should be necessary. - * - */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; - -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif -} - - -/* - * Account user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - int index; - - /* Add user time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for user time used */ - acct_update_integrals(p); -} - -/* - * Account guest cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += cputime; - p->utimescaled += cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += cputime; - - /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64) cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64) cputime; - cpustat[CPUTIME_GUEST] += (__force u64) cputime; - } -} - -/* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated - */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) -{ - /* Add system time to process. */ - p->stime += cputime; - p->stimescaled += cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - task_group_account_field(p, index, (__force u64) cputime); - - /* Account for system time used */ - acct_update_integrals(p); -} - -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ - int index; - - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); - return; - } - - if (hardirq_count() - hardirq_offset) - index = CPUTIME_IRQ; - else if (in_serving_softirq()) - index = CPUTIME_SOFTIRQ; - else - index = CPUTIME_SYSTEM; - - __account_system_time(p, cputime, cputime_scaled, index); -} - -/* - * Account for involuntary wait time. - * @cputime: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - - cpustat[CPUTIME_STEAL] += (__force u64) cputime; -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64) cputime; -} - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; - - account_steal_time(st); - return st; - } -#endif - return false; -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * Account a tick to a process and cpustat - * @p: the process that the cpu time gets accounted to - * @user_tick: is the tick from userspace - * @rq: the pointer to rq - * - * Tick demultiplexing follows the order - * - pending hardirq update - * - pending softirq update - * - user_time - * - idle_time - * - system time - * - check for guest_time - * - else account as system_time - * - * Check for hardirq is done both for system and user time as there is - * no timer going off while we are on hardirq and hence we may never get an - * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq - * softirq as those do not count in task exec_runtime any more. - */ -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - u64 *cpustat = kcpustat_this_cpu->cpustat; - - if (steal_account_process_tick()) - return; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; - } else if (this_cpu_ksoftirqd() == p) { - /* - * ksoftirqd time do not get accounted in cpu_softirq_time. - * So, we have to handle it separately here. - * Also, p->stime needs to be updated for ksoftirqd. - */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); - } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); - } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); - } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); - } -} - -static void irqtime_account_idle_ticks(int ticks) -{ - int i; - struct rq *rq = this_rq(); - - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} - -#endif - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); - - return (__force cputime_t) temp; -} - -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, utime = p->utime, total = utime + p->stime; - - /* - * Use CFS's precise accounting: - */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - - if (total) - utime = scale_utime(utime, rtime, total); - else - utime = rtime; - - /* - * Compare with previous values, to keep monotonicity: - */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); - - *ut = p->prev_utime; - *st = p->prev_stime; -} - -/* - * Must be called with siglock held. - */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct signal_struct *sig = p->signal; - struct task_cputime cputime; - cputime_t rtime, utime, total; - - thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; -} -#endif - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -8419,6 +7864,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ +struct cpuacct root_cpuacct; + /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..372692bd5376 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,504 @@ +#include +#include +#include +#include +#include +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. */ + if (TASK_NICE(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, utime = p->utime, total = utime + p->stime; + + /* + * Use CFS's precise accounting: + */ + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) + utime = scale_utime(utime, rtime, total); + else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + + *ut = p->prev_utime; + *st = p->prev_stime; +} + +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime.utime + cputime.stime; + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e77..804c2e5e7872 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -891,6 +891,9 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { @@ -917,6 +920,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1157,3 +1170,53 @@ enum rq_nohz_flag_bits { #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) #endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + -- cgit v1.2.3 From baa36046d09ea6dbc122c795566992318663d9eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 18 Jun 2012 17:54:14 +0200 Subject: cputime: Consolidate vtime handling on context switch The archs that implement virtual cputime accounting all flush the cputime of a task when it gets descheduled and sometimes set up some ground initialization for the next task to account its cputime. These archs all put their own hooks in their context switch callbacks and handle the off-case themselves. Consolidate this by creating a new account_switch_vtime() callback called in generic code right after a context switch and that these archs must implement to flush the prev task cputime and initialize the next task cputime related state. Signed-off-by: Frederic Weisbecker Acked-by: Martin Schwidefsky Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- arch/ia64/include/asm/switch_to.h | 8 -------- arch/ia64/kernel/time.c | 4 ++-- arch/powerpc/include/asm/time.h | 6 ------ arch/powerpc/kernel/process.c | 3 --- arch/powerpc/kernel/time.c | 6 ++++++ arch/s390/include/asm/switch_to.h | 2 -- arch/s390/kernel/vtime.c | 4 ++-- include/linux/kernel_stat.h | 6 ++++++ kernel/sched/core.c | 1 + 9 files changed, 17 insertions(+), 23 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h index cb2412fcd17f..d38c7ea5eea5 100644 --- a/arch/ia64/include/asm/switch_to.h +++ b/arch/ia64/include/asm/switch_to.h @@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task); extern void ia64_save_extra (struct task_struct *task); extern void ia64_load_extra (struct task_struct *task); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); -# define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) -#else -# define IA64_ACCOUNT_ON_SWITCH(p,n) -#endif - #ifdef CONFIG_PERFMON DECLARE_PER_CPU(unsigned long, pfm_syst_info); # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) @@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct || PERFMON_IS_SYSWIDE()) #define __switch_to(prev,next,last) do { \ - IA64_ACCOUNT_ON_SWITCH(prev, next); \ if (IA64_HAS_EXTRA_STATE(prev)) \ ia64_save_extra(prev); \ if (IA64_HAS_EXTRA_STATE(next)) \ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index ecc904b33c5f..6247197b9877 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,10 +88,10 @@ extern cputime_t cycle_to_cputime(u64 cyc); * accumulated times to the current process, and to prepare accounting on * the next process. */ -void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) +void account_switch_vtime(struct task_struct *prev) { struct thread_info *pi = task_thread_info(prev); - struct thread_info *ni = task_thread_info(next); + struct thread_info *ni = task_thread_info(current); cputime_t delta_stime, delta_utime; __u64 now; diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 3b4b4a8da922..c1f267694acb 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -197,12 +197,6 @@ struct cpu_usage { DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); -#if defined(CONFIG_VIRT_CPU_ACCOUNTING) -#define account_process_vtime(tsk) account_process_tick(tsk, 0) -#else -#define account_process_vtime(tsk) do { } while (0) -#endif - extern void secondary_cpu_time_init(void); DECLARE_PER_CPU(u64, decrementers_next_tb); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 710f400476de..d73fa999b47b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev, local_irq_save(flags); - account_system_vtime(current); - account_process_vtime(current); - /* * We can't take a PMU exception inside _switch() since there is a * window where the kernel stack SLB and the kernel stack are out diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be171ee73bf8..49da7f06e643 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -366,6 +366,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick) account_user_time(tsk, utime, utimescaled); } +void account_switch_vtime(struct task_struct *prev) +{ + account_system_vtime(prev); + account_process_tick(prev, 0); +} + #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() #endif diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index f223068b7822..e7f9b3d04fa1 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -89,12 +89,10 @@ static inline void restore_access_regs(unsigned int *acrs) prev = __switch_to(prev,next); \ } while (0) -extern void account_vtime(struct task_struct *, struct task_struct *); extern void account_tick_vtime(struct task_struct *); #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ - account_vtime(prev, current); \ } while (0) #endif /* __ASM_SWITCH_TO_H */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 4fc97b40a6e1..449ac22cc71b 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) return virt_timer_forward(user + system); } -void account_vtime(struct task_struct *prev, struct task_struct *next) +void account_switch_vtime(struct task_struct *prev) { struct thread_info *ti; @@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next) ti = task_thread_info(prev); ti->user_timer = S390_lowcore.user_timer; ti->system_timer = S390_lowcore.system_timer; - ti = task_thread_info(next); + ti = task_thread_info(current); S390_lowcore.user_timer = ti->user_timer; S390_lowcore.system_timer = ti->system_timer; } diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 2fbd9053c2df..bbe5d15d6597 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -130,4 +130,10 @@ extern void account_process_tick(struct task_struct *, int user); extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void account_switch_vtime(struct task_struct *prev); +#else +static inline void account_switch_vtime(struct task_struct *prev) { } +#endif + #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae3bcaa3afbf..78d9c965433a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1796,6 +1796,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; + account_switch_vtime(prev); finish_arch_switch(prev); #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); -- cgit v1.2.3 From 201c373e8e4823700d3160d5c28e1ab18fd1193e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 16 Aug 2012 17:03:24 +0900 Subject: sched/debug: Limit sd->*_idx range on sysctl Various sd->*_idx's are used for refering the rq's load average table when selecting a cpu to run. However they can be set to any number with sysctl knobs so that it can crash the kernel if something bad is given. Fix it by limiting them into the actual range. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345104204-8317-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ae66229238a0..ec0f2b81b81c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4896,16 +4896,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX; + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) + umode_t mode, proc_handler *proc_handler, + bool load_idx) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } } static struct ctl_table * @@ -4917,30 +4926,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return NULL; set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); + sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, true); set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); + sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[11], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring); + CORENAME_MAX_SIZE, 0444, proc_dostring, false); /* &table[12] is terminator */ return table; -- cgit v1.2.3 From 38b8dd6f87398524d02c21ff614c507ba8c9d295 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 3 Jul 2012 14:34:02 +0800 Subject: sched: Remove useless code in yield_to() It's impossible to enter the else branch if we have set skip_clock_update in task_yield_fair(), as yield_to_task_fair() will directly return true after invoke task_yield_fair(). Signed-off-by: Michael Wang Acked-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FF2925A.9060005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec0f2b81b81c..c46a011ce5db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4348,13 +4348,6 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); - } else { - /* - * We might have set it in task_yield_fair(), but are - * not going to schedule(), so don't want to skip - * the next update. - */ - rq->skip_clock_update = 0; } out: -- cgit v1.2.3 From f3e947867478af9a12b9956bcd000ac7613a8a95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Sep 2012 11:22:00 +0200 Subject: sched: Remove __ARCH_WANT_INTERRUPTS_ON_CTXSW Now that the last architecture to use this has stopped doing so (ARM, thanks Catalin!) we can remove this complexity from the scheduler core. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Catalin Marinas Link: http://lkml.kernel.org/n/tip-g9p2a1w81xxbrze25v9zpzbf@git.kernel.org Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-arch.txt | 10 --------- include/linux/sched.h | 5 ----- kernel/fork.c | 4 ---- kernel/sched/core.c | 40 +--------------------------------- kernel/sched/rt.c | 5 ----- kernel/sched/sched.h | 6 ----- 6 files changed, 1 insertion(+), 69 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 28aa1075e291..b1b8587b86f0 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -17,16 +17,6 @@ you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file Unlocked context switches introduce only a very minor performance penalty to the core scheduler implementation in the CONFIG_SMP case. -2. Interrupt status -By default, the switch_to arch function is called with interrupts -disabled. Interrupts may be enabled over the call if it is likely to -introduce a significant interrupt latency by adding the line -`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for -unlocked context switches. This define also implies -`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an -example. - - CPU idle ======== Your cpu_idle routines need to obey the following rules: diff --git a/include/linux/sched.h b/include/linux/sched.h index f3eebc121ebc..60e5e38eee2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -678,11 +678,6 @@ struct signal_struct { * (notably. ptrace) */ }; -/* Context switch must be unlocked if interrupts are to be enabled */ -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -# define __ARCH_WANT_UNLOCKED_CTXSW -#endif - /* * Bits in flags field of signal_struct. */ diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..743d48f4d711 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1280,11 +1280,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - p->hardirqs_enabled = 1; -#else p->hardirqs_enabled = 0; -#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c46a011ce5db..8b51b2d9b1fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1361,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) smp_send_reschedule(cpu); } -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -static int ttwu_activate_remote(struct task_struct *p, int wake_flags) -{ - struct rq *rq; - int ret = 0; - - rq = __task_rq_lock(p); - if (p->on_cpu) { - ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); - - return ret; - -} -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ - bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1440,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ - while (p->on_cpu) { -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - /* - * In case the architecture enables interrupts in - * context_switch(), we cannot busy wait, since that - * would lead to deadlocks when an interrupt hits and - * tries to wake up @prev. So bail and do a complete - * remote wakeup. - */ - if (ttwu_activate_remote(p, wake_flags)) - goto stat; -#else + while (p->on_cpu) cpu_relax(); -#endif - } /* * Pairs with the smp_wmb() in finish_lock_switch(). */ @@ -1798,13 +1766,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; account_switch_vtime(prev); finish_arch_switch(prev); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_disable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(prev, current); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - if (unlikely(task_running(rq, next_task))) - return 0; -#endif - retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 09871698e80c..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) */ next->on_cpu = 1; #endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else raw_spin_unlock(&rq->lock); -#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) @@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) smp_wmb(); prev->on_cpu = 0; #endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); -#endif } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ -- cgit v1.2.3 From 08bedae1d0acd8c9baf514fb69fa199d0c8345f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Sep 2012 00:03:50 +0200 Subject: sched: Fix load avg vs. cpu-hotplug Commit f319da0c68 ("sched: Fix load avg vs cpu-hotplug") was an incomplete fix: In particular, the problem is that at the point it calls calc_load_migrate() nr_running := 1 (the stopper thread), so move the call to CPU_DEAD where we're sure that nr_running := 0. Also note that we can call calc_load_migrate() without serialization, we know the state of rq is stable since its cpu is dead, and we modify the global state using appropriate atomic ops. Suggested-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1346882630.2600.59.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b51b2d9b1fd..ba144b121f3d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5048,7 +5048,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + case CPU_DEAD: calc_load_migrate(rq); break; #endif -- cgit v1.2.3 From 5d18023294abc22984886bd7185344e0c2be0daf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Aug 2012 11:26:57 +0200 Subject: sched: Fix load avg vs cpu-hotplug Rabik and Paul reported two different issues related to the same few lines of code. Rabik's issue is that the nr_uninterruptible migration code is wrong in that he sees artifacts due to this (Rabik please do expand in more detail). Paul's issue is that this code as it stands relies on us using stop_machine() for unplug, we all would like to remove this assumption so that eventually we can remove this stop_machine() usage altogether. The only reason we'd have to migrate nr_uninterruptible is so that we could use for_each_online_cpu() loops in favour of for_each_possible_cpu() loops, however since nr_uninterruptible() is the only such loop and its using possible lets not bother at all. The problem Rabik sees is (probably) caused by the fact that by migrating nr_uninterruptible we screw rq->calc_load_active for both rqs involved. So don't bother with fancy migration schemes (meaning we now have to keep using for_each_possible_cpu()) and instead fold any nr_active delta after we migrate all tasks away to make sure we don't have any skewed nr_active accounting. [ paulmck: Move call to calc_load_migration to CPU_DEAD to avoid miscounting noted by Rakib. ] Reported-by: Rakib Mullick Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc6..8c38b5e7ce47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5617,9 +5607,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_tasks(cpu); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + break; - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + case CPU_DEAD: + { + struct rq *dest_rq; + + local_irq_save(flags); + dest_rq = cpu_rq(smp_processor_id()); + raw_spin_lock(&dest_rq->lock); + calc_load_migrate(rq); + raw_spin_unlock_irqrestore(&dest_rq->lock, flags); + } break; #endif } -- cgit v1.2.3 From bf9fae9f5e4ca8dce4708812f9ad6281e61df109 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Sep 2012 15:23:11 +0200 Subject: cputime: Use a proper subsystem naming for vtime related APIs Use a naming based on vtime as a prefix for virtual based cputime accounting APIs: - account_system_vtime() -> vtime_account() - account_switch_vtime() -> vtime_task_switch() It makes it easier to allow for further declension such as vtime_account_system(), vtime_account_idle(), ... if we want to find out the context we account to from generic code. This also make it better to know on which subsystem these APIs refer to. Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- arch/ia64/kernel/time.c | 6 +++--- arch/powerpc/kernel/time.c | 10 +++++----- arch/s390/kernel/vtime.c | 6 +++--- include/linux/hardirq.h | 8 ++++---- include/linux/kernel_stat.h | 4 ++-- include/linux/kvm_host.h | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 8 ++++---- kernel/softirq.c | 6 +++--- 9 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6247197b9877..16bb6eda879d 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ extern cputime_t cycle_to_cputime(u64 cyc); * accumulated times to the current process, and to prepare accounting on * the next process. */ -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); @@ -116,7 +116,7 @@ void account_switch_vtime(struct task_struct *prev) * Account time for a transition between system, hard irq or soft irq state. * Note that this function is called with interrupts enabled. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); unsigned long flags; @@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); /* * Called from the timer interrupt handler to charge accumulated user time diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 49da7f06e643..39899d7ebda0 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -291,7 +291,7 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * Account time for a transition between system, hard irq * or soft irq state. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { u64 now, nowscaled, delta, deltascaled; unsigned long flags; @@ -343,14 +343,14 @@ void account_system_vtime(struct task_struct *tsk) } local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); /* * Transfer the user and system times accumulated in the paca * by the exception entry and exit code to the generic process * user and system time records. * Must be called with interrupts disabled. - * Assumes that account_system_vtime() has been called recently + * Assumes that vtime_account() has been called recently * (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ @@ -366,9 +366,9 @@ void account_process_tick(struct task_struct *tsk, int user_tick) account_user_time(tsk, utime, utimescaled); } -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { - account_system_vtime(prev); + vtime_account(prev); account_process_tick(prev, 0); } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 449ac22cc71b..cb5093c26d16 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) return virt_timer_forward(user + system); } -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { struct thread_info *ti; @@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick) * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); u64 timer, system; @@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk) virt_timer_forward(system); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); void __kprobes vtime_stop_cpu(void) { diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 305f23cd7cff..cab3da3d0949 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq); struct task_struct; #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void account_system_vtime(struct task_struct *tsk) +static inline void vtime_account(struct task_struct *tsk) { } #else -extern void account_system_vtime(struct task_struct *tsk); +extern void vtime_account(struct task_struct *tsk); #endif #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) @@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - account_system_vtime(current); \ + vtime_account(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -178,7 +178,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - account_system_vtime(current); \ + vtime_account(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index bbe5d15d6597..ca0944b92f4a 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -131,9 +131,9 @@ extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); #ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void account_switch_vtime(struct task_struct *prev); +extern void vtime_task_switch(struct task_struct *prev); #else -static inline void account_switch_vtime(struct task_struct *prev) { } +static inline void vtime_task_switch(struct task_struct *prev) { } #endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b70b48b01098..8a59e0abe5fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { BUG_ON(preemptible()); - account_system_vtime(current); + vtime_account(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void) static inline void kvm_guest_exit(void) { - account_system_vtime(current); + vtime_account(current); current->flags &= ~PF_VCPU; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba144b121f3d..21e4dcff18f3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1764,7 +1764,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; - account_switch_vtime(prev); + vtime_task_switch(prev); finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 372692bd5376..53f5b12f2821 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -10,11 +10,11 @@ /* * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU + * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old + * race with irq/vtime_account on this CPU. We would either get old * or new value with a side effect of accounting a slice of irq time to wrong * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void account_system_vtime(struct task_struct *curr) +void vtime_account(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void account_system_vtime(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); static int irqtime_account_hi_update(void) { diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..d55e3159f928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -220,7 +220,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_system_vtime(current); + vtime_account(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -271,7 +271,7 @@ restart: lockdep_softirq_exit(); - account_system_vtime(current); + vtime_account(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -340,7 +340,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - account_system_vtime(current); + vtime_account(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.3 From 04e7e951532b390b16feb070be9972b8fad2fc57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Jul 2012 15:06:40 -0700 Subject: rcu: Switch task's syscall hooks on context switch Clear the syscalls hook of a task when it's scheduled out so that if the task migrates, it doesn't run the syscall slow path on a CPU that might not need it. Also set the syscalls hook on the next task if needed. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/um/drivers/mconsole_kern.c | 1 + include/linux/rcupdate.h | 2 ++ include/linux/sched.h | 8 ++++++++ kernel/rcutree.c | 15 +++++++++++++++ kernel/sched/core.c | 1 + 5 files changed, 27 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 664a60e8dfb4..c17de0db6736 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,6 +705,7 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; + rcu_switch(from, to); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f5034f22e94b..7c968e4f929e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -197,6 +197,8 @@ extern void rcu_user_enter(void); extern void rcu_user_exit(void); extern void rcu_user_enter_after_irq(void); extern void rcu_user_exit_after_irq(void); +extern void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 23bddac4bad8..335720a1fc33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1885,6 +1885,14 @@ static inline void rcu_copy_process(struct task_struct *p) #endif +static inline void rcu_switch(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_RCU_USER_QS + rcu_user_hooks_switch(prev, next); +#endif +} + static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b82a9565149..d2e74c8d4b0e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -717,6 +717,21 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_RCU_USER_QS +void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next) +{ + struct rcu_dynticks *rdtp; + + /* Interrupts are disabled in context switch */ + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->ignore_user_qs) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} +#endif /* #ifdef CONFIG_RCU_USER_QS */ + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a48cdbc8631..ea2213b07d9d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch(prev, next); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From 90a340ed53f0f3bcc3fdf1b2cff56c0e4e911d01 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:36 +0200 Subject: rcu: Exit RCU extended QS on kernel preemption after irq/exception When an exception or an irq exits, and we are going to resume into interrupted kernel code, the low level architecture code calls preempt_schedule_irq() if there is a need to reschedule. If the interrupt/exception occured between a call to rcu_user_enter() (from syscall exit, exception exit, do_notify_resume exit, ...) and a real resume to userspace (iret,...), preempt_schedule_irq() can be called whereas RCU thinks we are in userspace. But preempt_schedule_irq() is going to run kernel code and may be some RCU read side critical section. We must exit the userspace extended quiescent state before we call it. To solve this, just call rcu_user_exit() in the beginning of preempt_schedule_irq(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea2213b07d9d..4adcd237c545 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3570,6 +3570,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); + rcu_user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); -- cgit v1.2.3 From 20ab65e33f469c35f3dabde3445b668aa9c943ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:37 +0200 Subject: rcu: Exit RCU extended QS on user preemption When exceptions or irq are about to resume userspace, if the task needs to be rescheduled, the arch low level code calls schedule() directly. If we call it, it is because we have the TIF_RESCHED flag: - It can be set after random local calls to set_need_resched() (RCU, drm, ...) - A wake up happened and the CPU needs preemption. This can happen in several ways: * Remotely: the remote waking CPU has set TIF_RESCHED and send the wakee an IPI to schedule the new task. * Remotely enqueued: the remote waking CPU sends an IPI to the target and the wake up is made by the target. * Locally: waking CPU == wakee CPU and the wakeup is done locally. set_need_resched() is called without IPI. In the case of local and remotely enqueued wake ups, the tick can be restarted when we enqueue the new task and RCU can exit the extended quiescent state at the same time. Then by the time we reach irq exit path and we call schedule, we are not in RCU user mode. But if we call schedule() only because something called set_need_resched(), RCU may still be in user mode when we reach schedule. Also if a wake up is done remotely, the CPU might see the TIF_RESCHED flag and call schedule while the IPI has not yet happen to restart the tick and exit RCU user mode. We need to manually protect against these corner cases. Create a new API schedule_user() that calls schedule() inside rcu_user_exit()-rcu_user_enter() in order to protect it. Archs will need to rely on it now to implement user preemption safely. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4adcd237c545..3c4dec0594d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3469,6 +3469,21 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_RCU_USER_QS +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + rcu_user_exit(); + schedule(); + rcu_user_enter(); +} +#endif + /** * schedule_preempt_disabled - called with preemption disabled * -- cgit v1.2.3 From 16a8016372c42c7628eb4a39d75386a461e8c5d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 1 Jun 2012 14:22:01 -0400 Subject: sanitize tsk_is_polling() Make default just return 0. The current default (checking TIF_POLLING_NRFLAG) is taken to architectures that need it; ones that don't do polling in their idle threads don't need to defined TIF_POLLING_NRFLAG at all. ia64 defined both TS_POLLING (used by its tsk_is_polling()) and TIF_POLLING_NRFLAG (not used at all). Killed the latter... Signed-off-by: Al Viro --- arch/alpha/include/asm/thread_info.h | 2 ++ arch/arm/include/asm/thread_info.h | 2 -- arch/avr32/include/asm/thread_info.h | 4 ---- arch/blackfin/include/asm/thread_info.h | 3 --- arch/c6x/include/asm/thread_info.h | 1 - arch/c6x/kernel/asm-offsets.c | 1 - arch/cris/include/asm/thread_info.h | 2 -- arch/frv/include/asm/thread_info.h | 2 -- arch/h8300/include/asm/thread_info.h | 3 --- arch/hexagon/include/asm/thread_info.h | 2 -- arch/ia64/include/asm/thread_info.h | 2 -- arch/m32r/include/asm/thread_info.h | 2 -- arch/microblaze/include/asm/thread_info.h | 1 + arch/mips/include/asm/thread_info.h | 2 -- arch/mn10300/include/asm/thread_info.h | 2 ++ arch/openrisc/include/asm/thread_info.h | 2 ++ arch/parisc/include/asm/thread_info.h | 2 ++ arch/powerpc/include/asm/thread_info.h | 2 ++ arch/s390/include/asm/thread_info.h | 3 --- arch/score/include/asm/thread_info.h | 3 --- arch/sh/include/asm/thread_info.h | 3 +++ arch/sparc/include/asm/thread_info_32.h | 2 ++ arch/sparc/include/asm/thread_info_64.h | 3 +++ arch/um/include/asm/thread_info.h | 3 --- arch/xtensa/include/asm/thread_info.h | 2 -- kernel/sched/core.c | 2 +- 26 files changed, 20 insertions(+), 38 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 2601c8e338da..4554ecbff7c6 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -116,5 +116,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); (int __user *)(value)); \ }) +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) + #endif /* __KERNEL__ */ #endif /* _ALPHA_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index af7b0bda3355..b2d6b412172d 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -148,7 +148,6 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 -#define TIF_POLLING_NRFLAG 16 #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 20 @@ -160,7 +159,6 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index a32c88cb2601..dc0c45721499 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -77,8 +77,6 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling - TIF_NEED_RESCHED */ #define TIF_BREAKPOINT 4 /* enter monitor mode on return */ #define TIF_SINGLE_STEP 5 /* single step in progress */ #define TIF_MEMDIE 6 /* is terminating due to OOM killer */ @@ -91,7 +89,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_SINGLE_STEP (1 << TIF_SINGLE_STEP) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) @@ -104,7 +101,6 @@ static inline struct thread_info *current_thread_info(void) ((1 << TIF_SIGPENDING) \ | _TIF_NOTIFY_RESUME \ | (1 << TIF_NEED_RESCHED) \ - | (1 << TIF_POLLING_NRFLAG) \ | (1 << TIF_BREAKPOINT) \ | (1 << TIF_RESTORE_SIGMASK)) diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index f9a8731b7e81..3894005337ba 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -96,8 +96,6 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling - TIF_NEED_RESCHED */ #define TIF_MEMDIE 4 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ #define TIF_IRQ_SYNC 7 /* sync pipeline stage */ @@ -108,7 +106,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1<status &= ~TS_RESTORE_SIGMASK; return true; } +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index a75a11abf7b0..8debe9e91754 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -103,7 +103,6 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ -#define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FIXADE 20 /* Fix address errors in software */ #define TIF_LOGADE 21 /* Log address errors to syslog */ @@ -126,7 +125,6 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define _TIF_SECCOMP (1<status &= ~TS_RESTORE_SIGMASK; return true; } + +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index dd3807599bb9..25849ae3e900 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -132,6 +132,8 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ _TIF_SIGPENDING) +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) + #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index cfa8c38fb9c8..4e2276631081 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -256,6 +256,9 @@ static inline bool test_and_clear_restore_sigmask(void) ti->status &= ~TS_RESTORE_SIGMASK; return true; } + +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index c04e5ab68f56..2c8eeb2df8b4 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -65,8 +65,6 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling - * TIF_NEED_RESCHED */ #define TIF_RESTART_BLOCK 4 #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 6 @@ -76,7 +74,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index a27bc8ce5eff..214be732b2b2 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -132,14 +132,12 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ -#define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define _TIF_SYSCALL_TRACE (1< Date: Tue, 25 Sep 2012 21:12:30 +0800 Subject: sched: Ensure 'sched_domains_numa_levels' is safe to use in other functions We should temporarily reset 'sched_domains_numa_levels' to 0 after it is reset to 'level' in sched_init_numa(). If it fails to allocate memory for array sched_domains_numa_masks[][], the array will contain less then 'level' members. This could be dangerous when we use it to iterate array sched_domains_numa_masks[][] in other functions. This patch set sched_domains_numa_levels to 0 before initializing array sched_domains_numa_masks[][], and reset it to 'level' when sched_domains_numa_masks[][] is fully initialized. Signed-off-by: Tang Chen Signed-off-by: Wen Congyang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1348578751-16904-2-git-send-email-tangchen@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c17747236438..f895fdd32c5b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6122,6 +6122,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6176,6 +6187,8 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; } #else static inline void sched_init_numa(void) -- cgit v1.2.3 From 301a5cba2887d1f640e6d5184b05a6d7132017d5 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 25 Sep 2012 21:12:31 +0800 Subject: sched: Update sched_domains_numa_masks[][] when new cpus are onlined Once array sched_domains_numa_masks[] []is defined, it is never updated. When a new cpu on a new node is onlined, the coincident member in sched_domains_numa_masks[][] is not initialized, and all the masks are 0. As a result, the build_overlap_sched_groups() will initialize a NULL sched_group for the new cpu on the new node, which will lead to kernel panic: [ 3189.403280] Call Trace: [ 3189.403286] [] warn_slowpath_common+0x7f/0xc0 [ 3189.403289] [] warn_slowpath_null+0x1a/0x20 [ 3189.403292] [] build_sched_domains+0x467/0x470 [ 3189.403296] [] partition_sched_domains+0x307/0x510 [ 3189.403299] [] ? partition_sched_domains+0x142/0x510 [ 3189.403305] [] cpuset_update_active_cpus+0x83/0x90 [ 3189.403308] [] cpuset_cpu_active+0x38/0x70 [ 3189.403316] [] notifier_call_chain+0x67/0x150 [ 3189.403320] [] ? native_cpu_up+0x18a/0x1b5 [ 3189.403328] [] __raw_notifier_call_chain+0xe/0x10 [ 3189.403333] [] __cpu_notify+0x20/0x40 [ 3189.403337] [] _cpu_up+0xe9/0x131 [ 3189.403340] [] cpu_up+0xdb/0xee [ 3189.403348] [] store_online+0x9c/0xd0 [ 3189.403355] [] dev_attr_store+0x20/0x30 [ 3189.403361] [] sysfs_write_file+0xa3/0x100 [ 3189.403368] [] vfs_write+0xd0/0x1a0 [ 3189.403371] [] sys_write+0x54/0xa0 [ 3189.403375] [] system_call_fastpath+0x16/0x1b [ 3189.403377] ---[ end trace 1e6cf85d0859c941 ]--- [ 3189.403398] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 This patch registers a new notifier for cpu hotplug notify chain, and updates sched_domains_numa_masks every time a new cpu is onlined or offlined. Signed-off-by: Tang Chen Signed-off-by: Wen Congyang [ fixed compile warning ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1348578751-16904-3-git-send-email-tangchen@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f895fdd32c5b..8322d73b4392 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6190,10 +6190,65 @@ static void sched_init_numa(void) sched_domains_numa_levels = level; } + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -6642,6 +6697,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); -- cgit v1.2.3