From 7a14ce1d8c1d3a6118d406e64eaf9aa70375e085 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:43:53 +0200 Subject: nohz: reduce jiffies polling overhead Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..cb75394ed00e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -48,6 +48,13 @@ static void tick_do_update_jiffies64(ktime_t now) unsigned long ticks = 0; ktime_t delta; + /* + * Do a quick check without holding xtime_lock: + */ + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 < tick_period.tv64) + return; + /* Reevalute with xtime_lock held */ write_seqlock(&xtime_lock); -- cgit v1.2.3 From aa276e1cafb3ce9d01d1e837bcd67e92616013ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jun 2008 19:15:00 +0200 Subject: x86, clockevents: add C1E aware idle function C1E on AMD machines is like C3 but without control from the OS. Up to now we disabled the local apic timer for those machines as it stops when the CPU goes into C1E. This excludes those machines from high resolution timers / dynamic ticks, which hurts especially X2 based laptops. The current boot time C1E detection has another, more serious flaw as well: some BIOSes do not enable C1E until the ACPI processor module is loaded. This causes systems to stop working after that point. To work nicely with C1E enabled machines we use a separate idle function, which checks on idle entry whether C1E was enabled in the Interrupt Pending Message MSR. This allows us to do timer broadcasting for C1E and covers the late enablement of C1E as well. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 5 ++-- arch/x86/kernel/apic_64.c | 26 +---------------- arch/x86/kernel/cpu/amd.c | 30 -------------------- arch/x86/kernel/cpu/amd_64.c | 25 ----------------- arch/x86/kernel/process.c | 66 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/apic.h | 3 -- kernel/time/tick-broadcast.c | 6 ++-- 7 files changed, 73 insertions(+), 88 deletions(-) (limited to 'kernel/time') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..c44206e731d4 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -64,9 +64,8 @@ static int enable_local_apic __initdata; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; -/* Disable local APIC timer from the kernel commandline or via dmi quirk - or using CPU MSR check */ -int local_apic_timer_disabled; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int local_apic_timer_disabled; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd0dc29..a5cc8447cf4d 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -43,7 +43,7 @@ #include #include -int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; @@ -422,32 +422,8 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -/* - * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the - * C1E flag only in the secondary CPU, so when we detect the wreckage - * we already have enabled the boot CPU local apic timer. Check, if - * disable_apic_timer is set and the DUMMY flag is cleared. If yes, - * set the DUMMY flag again and force the broadcast mode in the - * clockevents layer. - */ -static void __cpuinit check_boot_apic_timer_broadcast(void) -{ - if (!disable_apic_timer || - (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) - return; - - printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); - lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, - &boot_cpu_physical_apicid); - local_irq_disable(); -} - void __cpuinit setup_secondary_APIC_clock(void) { - check_boot_apic_timer_broadcast(); setup_APIC_timer(); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e76b49e7a916..acc891ae5901 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,31 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -#ifdef CONFIG_X86_LOCAL_APIC - -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) -{ - u32 lo, hi; - - if (c->x86 < 0x0F) - return 0; - - /* Family 0x0f models < rev F do not have this MSR */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - if (smp_processor_id() != boot_cpu_physical_apicid) - printk(KERN_INFO "AMD C1E detected late. " - "Force timer broadcast.\n"); - return 1; - } - return 0; -} -#endif - int force_mwait __cpuinitdata; static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) @@ -285,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } -#ifdef CONFIG_X86_LOCAL_APIC - if (amd_apic_timer_broken(c)) - local_apic_timer_disabled = 1; -#endif - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index f5fc161d8f2a..f8d20588bde9 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -110,28 +110,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) -{ - u32 lo, hi; - - if (c->x86 < 0x0F) - return 0; - - /* Family 0x0f models < rev F do not have this MSR */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - if (smp_processor_id() != boot_cpu_physical_apicid) - printk(KERN_INFO "AMD C1E detected late. " - "Force timer broadcast.\n"); - return 1; - } - return 0; -} - void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -212,9 +190,6 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) amd_enable_pci_ext_cfg(c); - if (amd_apic_timer_broken(c)) - disable_apic_timer = 1; - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { unsigned long long tseg; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9fea14607dfe..68ad3539b143 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -6,6 +6,7 @@ #include #include #include +#include struct kmem_cache *task_xstate_cachep; @@ -219,6 +220,68 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +/* + * Check for AMD CPUs, which have potentially C1E support + */ +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_AMD) + return 0; + + if (c->x86 < 0x0F) + return 0; + + /* Family 0x0f models < rev F do not have C1E */ + if (c->x86 == 0x0f && c->x86_model < 0x40) + return 0; + + return 1; +} + +/* + * C1E aware idle routine. We check for C1E active in the interrupt + * pending message MSR. If we detect C1E, then we handle it the same + * way as C3 power states (local apic timer and TSC stop) + */ +static void c1e_idle(void) +{ + static cpumask_t c1e_mask = CPU_MASK_NONE; + static int c1e_detected; + + if (need_resched()) + return; + + if (!c1e_detected) { + u32 lo, hi; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + c1e_detected = 1; + mark_tsc_unstable("TSC halt in C1E"); + printk(KERN_INFO "System has C1E enabled\n"); + } + } + + if (c1e_detected) { + int cpu = smp_processor_id(); + + if (!cpu_isset(cpu, c1e_mask)) { + cpu_set(cpu, c1e_mask); + /* Force broadcast so ACPI can not interfere */ + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + &cpu); + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", + cpu); + } + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + default_idle(); + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); + } else + default_idle(); +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_SMP @@ -236,6 +299,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; + } else if (check_c1e_idle(c)) { + printk(KERN_INFO "using C1E aware idle routine\n"); + pm_idle = c1e_idle; } else pm_idle = default_idle; } diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index be9639a9a186..3c387cda95fa 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -38,12 +38,9 @@ extern void generic_apic_probe(void); extern int apic_verbosity; extern int timer_over_8254; extern int local_apic_timer_c2_ok; -extern int local_apic_timer_disabled; -extern int apic_runs_main_timer; extern int ioapic_force; extern int disable_apic; -extern int disable_apic_timer; /* * Basic functions accessing APICs. diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec0..67f80c261709 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -30,6 +30,7 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); @@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why) CLOCK_EVT_MODE_SHUTDOWN); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - dev->features |= CLOCK_EVT_FEAT_DUMMY; + tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (cpu_isset(cpu, tick_broadcast_mask)) { + if (!tick_broadcast_force && + cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); -- cgit v1.2.3 From 857f3fd7a496ddf4329345af65a4a2b16dd25fe8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 11 Jul 2008 11:09:22 +0200 Subject: nohz: don't stop idle tick if softirqs are pending. In case a cpu goes idle but softirqs are pending only an error message is printed to the console. It may take a very long time until the pending softirqs will finally be executed. Worst case would be a hanging system. With this patch the timer tick just continues and the softirqs will be executed after the next interrupt. Still a delay but better than a hanging system. Currently we have at least two device drivers on s390 which under certain circumstances schedule a tasklet from process context. This is a reason why we can end up with pending softirqs when going idle. Fixing these drivers seems to be non-trivial. However there is no question that the drivers should be fixed. This patch shouldn't be considered as a bug fix. It just is intended to keep a system running even if device drivers are buggy. Signed-off-by: Heiko Carstens Cc: Jan Glauber Cc: Stefan Weinhuber Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb75394ed00e..86baa4f0dfe4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -235,6 +235,7 @@ void tick_nohz_stop_sched_tick(void) local_softirq_pending()); ratelimit++; } + goto end; } ts->idle_calls++; -- cgit v1.2.3 From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latencing timings of ftrace and also caused the irqoff self tests to fail. What was happening is with NO_HZ the idle would stop the jiffy counter and before the jiffy counter was updated the sched_clock would have a bad delta jiffies to compare with the gtod with the maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would compare the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..33a8f42041fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cbd..97159e225a77 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. + */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3