From cced40229993bb63238299e48a22e4c8d1e13181 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 17 Nov 2011 23:43:40 +0100 Subject: x86: Use kmemdup() in copy_thread(), rather than duplicating its implementation The semantic patch that makes this change is available in scripts/coccinelle/api/memdup.cocci. Signed-off-by: Thomas Meyer Link: http://lkml.kernel.org/r/1321569820.1624.275.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6eebf31..d2c1f6208d62 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -293,13 +293,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } -- cgit v1.2.3 From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/arm/kernel/process.c | 4 +- arch/avr32/kernel/process.c | 4 +- arch/blackfin/kernel/process.c | 4 +- arch/microblaze/kernel/process.c | 4 +- arch/mips/kernel/process.c | 4 +- arch/openrisc/kernel/idle.c | 4 +- arch/powerpc/kernel/idle.c | 4 +- arch/powerpc/platforms/iseries/setup.c | 8 +-- arch/s390/kernel/process.c | 4 +- arch/sh/kernel/idle.c | 4 +- arch/sparc/kernel/process_64.c | 4 +- arch/tile/kernel/process.c | 4 +- arch/um/kernel/process.c | 4 +- arch/unicore32/kernel/process.c | 4 +- arch/x86/kernel/process_32.c | 4 +- arch/x86/kernel/process_64.c | 4 +- include/linux/tick.h | 13 ++--- kernel/softirq.c | 2 +- kernel/time/tick-sched.c | 93 +++++++++++++++++++++------------- 19 files changed, 99 insertions(+), 77 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d0c6fb74ae4..3f1f8daf703c 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ef5a2a08fcca..6ee7952248db 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 6a80a9e9fc4a..7b141b5c9e8d 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 95cc295976a7..5407f09b4be4 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c47f96e453c0..c11e5ca2a434 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index d5bc5f813e89..fb6a9bf40006 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 39a2baa6ad58..878572f70ac5 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -56,7 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index ea0acbd8966d..e83dfaf89f69 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 9451b210a1b4..6224f9dbbc1f 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) default_idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index db4ecd731a00..6015743020a0 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 3739a06a76cb..9c2795ba2cfe 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 9c45d8bbdf57..920e674aedb9 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c5338351aecd..cfb657e92849 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index ba401df971ed..9999b9a84d46 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f984c2..6d9d4d52cac5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6eebf31..b069e9d7875f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index ca40838fdfb7..0df1d50a408a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -121,21 +121,22 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_stop_sched_tick(int inidle); -extern void tick_nohz_restart_sched_tick(void); +extern void tick_nohz_idle_enter(void); +extern void tick_nohz_idle_exit(void); +extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) +static inline void tick_nohz_idle_enter(void) { - if (inidle) - rcu_idle_enter(); + rcu_idle_enter(); } -static inline void tick_nohz_restart_sched_tick(void) +static inline void tick_nohz_idle_exit(void) { rcu_idle_exit(); } + static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c71d91efff0..f9f2aa81ce53 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -351,7 +351,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_stop_sched_tick(0); + tick_nohz_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d9d23665f12..266c242dc354 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. - */ -void tick_nohz_stop_sched_tick(int inidle) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; - local_irq_save(flags); - cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - /* - * Call to tick_nohz_start_idle stops the last_update_time from being - * updated. Thus, it must not be called in the event we are called from - * irq_exit() with the prior state different than idle. - */ - if (!inidle && !ts->inidle) - goto end; - - /* - * Set ts->inidle unconditionally. Even if the system did not - * switch to NOHZ mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; - now = tick_nohz_start_idle(cpu, ts); /* @@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; + return; if (need_resched()) - goto end; + return; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; @@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) (unsigned int) local_softirq_pending()); ratelimit++; } - goto end; + return; } ts->idle_calls++; @@ -471,10 +446,54 @@ out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -end: - if (inidle) - rcu_idle_enter(); - local_irq_restore(flags); +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * This also enters into RCU extended quiescent state so that this CPU doesn't + * need anymore to be part of any global grace period completion. This way + * the tick can be stopped safely as we don't need to report quiescent states. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + local_irq_disable(); + + ts = &__get_cpu_var(tick_cpu_sched); + /* + * set ts->inidle unconditionally. even if the system did not + * switch to nohz mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + tick_nohz_stop_sched_tick(ts); + rcu_idle_enter(); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!ts->inidle) + return; + + tick_nohz_stop_sched_tick(ts); } /** @@ -516,11 +535,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task + * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. */ -void tick_nohz_restart_sched_tick(void) +void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -- cgit v1.2.3 From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney --- arch/arm/kernel/process.c | 4 +-- arch/avr32/kernel/process.c | 4 +-- arch/blackfin/kernel/process.c | 4 +-- arch/microblaze/kernel/process.c | 4 +-- arch/mips/kernel/process.c | 4 +-- arch/openrisc/kernel/idle.c | 4 +-- arch/powerpc/kernel/idle.c | 4 +-- arch/powerpc/platforms/iseries/setup.c | 8 +++--- arch/s390/kernel/process.c | 4 +-- arch/sh/kernel/idle.c | 4 +-- arch/sparc/kernel/process_64.c | 4 +-- arch/tile/kernel/process.c | 4 +-- arch/um/kernel/process.c | 4 +-- arch/unicore32/kernel/process.c | 4 +-- arch/x86/kernel/process_32.c | 4 +-- arch/x86/kernel/process_64.c | 4 +-- include/linux/tick.h | 46 +++++++++++++++++++++++++++++++--- kernel/time/tick-sched.c | 25 +++++++++--------- 18 files changed, 90 insertions(+), 49 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3f1f8daf703c..47e34c091276 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 6ee7952248db..34c8c703bb16 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 7b141b5c9e8d..57e07498a0e7 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 5407f09b4be4..13d59f34b94e 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c11e5ca2a434..17fb3a270160 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index fb6a9bf40006..2e82cd0fa5e1 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 878572f70ac5..2e782a36d8f2 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -56,7 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index e83dfaf89f69..d69d3d185e89 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6224f9dbbc1f..6fa987367ae6 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) default_idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 6015743020a0..ad58e7535a7c 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 9c2795ba2cfe..4a0e7d79cb92 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 920e674aedb9..53ac89595ab1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index cfb657e92849..55d2cf455f63 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 9999b9a84d46..095ff5a57928 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d9d4d52cac5..f94da3920c36 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b069e9d7875f..18e8cf3581f6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0df1d50a408a..327434a05757 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -121,18 +122,57 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); +extern void __tick_nohz_idle_enter(void); +static inline void tick_nohz_idle_enter(void) +{ + local_irq_disable(); + __tick_nohz_idle_enter(); + local_irq_enable(); +} extern void tick_nohz_idle_exit(void); + +/* + * Call this pair of function if the arch doesn't make any use + * of RCU in-between. You won't need to call rcu_idle_enter() and + * rcu_idle_exit(). + * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() + * and explicitly tell RCU about the window around the place the CPU enters low + * power mode where no RCU use is made. This is done by calling rcu_idle_enter() + * after the last use of RCU before the CPU is put to sleep and by calling + * rcu_idle_exit() before the first use of RCU after the CPU woke up. + */ +static inline void tick_nohz_idle_enter_norcu(void) +{ + /* + * Also call rcu_idle_enter() in the irq disabled section even + * if it disables irq itself. + * Just an optimization that prevents from an interrupt happening + * between it and __tick_nohz_idle_enter() to lose time to help + * completing a grace period while we could be in extended grace + * period already. + */ + local_irq_disable(); + __tick_nohz_idle_enter(); + rcu_idle_enter(); + local_irq_enable(); +} +static inline void tick_nohz_idle_exit_norcu(void) +{ + rcu_idle_exit(); + tick_nohz_idle_exit(); +} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) +static inline void tick_nohz_idle_enter(void) { } +static inline void tick_nohz_idle_exit(void) { } +static inline void tick_nohz_idle_enter_norcu(void) { rcu_idle_enter(); } -static inline void tick_nohz_idle_exit(void) +static inline void tick_nohz_idle_exit_norcu(void) { rcu_idle_exit(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 266c242dc354..c76aefe764b0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -453,18 +453,22 @@ out: * * When the next event is more than a tick into the future, stop the idle tick * Called when we start the idle loop. - * This also enters into RCU extended quiescent state so that this CPU doesn't - * need anymore to be part of any global grace period completion. This way - * the tick can be stopped safely as we don't need to report quiescent states. + * + * If no use of RCU is made in the idle loop between + * tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, then + * tick_nohz_idle_enter_norcu() should be called instead and the arch + * doesn't need to call rcu_idle_enter() and rcu_idle_exit() explicitly. + * + * Otherwise the arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. */ -void tick_nohz_idle_enter(void) +void __tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - - local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); /* * set ts->inidle unconditionally. even if the system did not @@ -473,9 +477,6 @@ void tick_nohz_idle_enter(void) */ ts->inidle = 1; tick_nohz_stop_sched_tick(ts); - rcu_idle_enter(); - - local_irq_enable(); } /** @@ -551,7 +552,7 @@ void tick_nohz_idle_exit(void) ktime_t now; local_irq_disable(); - rcu_idle_exit(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); -- cgit v1.2.3 From e37e112de3ac64032df45c2db0dbe1e8f1af86b4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:08 +0200 Subject: x86: Enter rcu extended qs after idle notifier call The idle notifier, called by enter_idle(), enters into rcu read side critical section but at that time we already switched into the RCU-idle window (rcu_idle_enter() has been called). And it's illegal to use rcu_read_lock() in that state. This results in rcu reporting its bad mood: [ 1.275635] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110() [ 1.275635] Hardware name: AMD690VM-FMH [ 1.275635] Modules linked in: [ 1.275635] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #252 [ 1.275635] Call Trace: [ 1.275635] [] warn_slowpath_common+0x7a/0xb0 [ 1.275635] [] warn_slowpath_null+0x15/0x20 [ 1.275635] [] __atomic_notifier_call_chain+0xd2/0x110 [ 1.275635] [] atomic_notifier_call_chain+0x11/0x20 [ 1.275635] [] enter_idle+0x20/0x30 [ 1.275635] [] cpu_idle+0xa5/0x110 [ 1.275635] [] rest_init+0xe5/0x140 [ 1.275635] [] ? rest_init+0x48/0x140 [ 1.275635] [] start_kernel+0x3d1/0x3dc [ 1.275635] [] x86_64_start_reservations+0x131/0x135 [ 1.275635] [] x86_64_start_kernel+0xed/0xf4 [ 1.275635] ---[ end trace a22d306b065d4a66 ]--- Fix this by entering rcu extended quiescent state later, just before the CPU goes to sleep. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/process_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 18e8cf3581f6..64e926c89a6f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + if (cpuidle_idle_call()) pm_idle(); + + rcu_idle_exit(); start_critical_timings(); /* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit_norcu(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.2.3 From b3b0870ef3ffed72b92415423da864f440f57ad6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 15:45:23 -0800 Subject: i387: do not preload FPU state at task switch time Yes, taking the trap to re-load the FPU/MMX state is expensive, but so is spending several days looking for a bug in the state save/restore code. And the preload code has some rather subtle interactions with both paravirtualization support and segment state restore, so it's not nearly as simple as it should be. Also, now that we no longer necessarily depend on a single bit (ie TS_USEDFPU) for keeping track of the state of the FPU, we migth be able to do better. If we are really switching between two processes that keep touching the FP state, save/restore is inevitable, but in the case of having one process that does most of the FPU usage, we may actually be able to do much better than the preloading. In particular, we may be able to keep track of which CPU the process ran on last, and also per CPU keep track of which process' FP state that CPU has. For modern CPU's that don't destroy the FPU contents on save time, that would allow us to do a lazy restore by just re-enabling the existing FPU state - with no restore cost at all! Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 1 - arch/x86/kernel/process_32.c | 20 -------------------- arch/x86/kernel/process_64.c | 23 ----------------------- arch/x86/kernel/traps.c | 35 +++++++++++------------------------ 4 files changed, 11 insertions(+), 68 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 548b2c07ac9a..86974c72d0d0 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -30,7 +30,6 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); -extern void __math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cda..324cd722b447 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -299,23 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); - bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - __unlazy_fpu(prev_p); - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); - /* * Reload esp0. */ @@ -354,11 +342,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); - /* If we're going to preload the fpu context, make sure clts - is run while we're batching the cpu state updates. */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -368,9 +351,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - if (preload_fpu) - __math_state_restore(); - /* * Restore %gs if needed (which is common) */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..992b4e542bc3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,18 +386,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; - bool preload_fpu; - - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); /* * Reload esp0, LDT and the page table pointer: @@ -430,10 +418,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ __unlazy_fpu(prev_p); - /* Make sure cpu is ready for new context */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -492,13 +476,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* - * Preload the FPU context, now that we've determined that the - * task is likely to be using it. - */ - if (preload_fpu) - __math_state_restore(); - return prev_p; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fc676e44c77f..5afe824c66e5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,28 +570,6 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -/* - * __math_state_restore assumes that cr0.TS is already clear and the - * fpu state is all ready for use. Used during context switch. - */ -void __math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - - __thread_set_has_fpu(thread); /* clts in caller! */ - tsk->fpu_counter++; -} - /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -622,9 +600,18 @@ void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + __thread_fpu_begin(thread); - __math_state_restore(); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(thread); + force_sig(SIGSEGV, tsk); + return; + } + + tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3 From 4903062b5485f0e2c286a23b44c9b59d9b017d53 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 19:11:15 -0800 Subject: i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to restore The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. In order to not leak FIP state from one process to another, we need to do a floating point load after the fxsave of the old process, and before the fxrstor of the new FPU state. That resets the state to the (uninteresting) kernel load, rather than some potentially sensitive user information. We used to do this directly after the FPU state save, but that is actually very inconvenient, since it (a) corrupts what is potentially perfectly good FPU state that we might want to lazy avoid restoring later and (b) on x86-64 it resulted in a very annoying ordering constraint, where "__unlazy_fpu()" in the task switch needs to be delayed until after the DS segment has been reloaded just to get the new DS value. Coupling it to the fxrstor instead of the fxsave automatically avoids both of these issues, and also ensures that we only do it when actually necessary (the FP state after a save may never actually get used). It's simply a much more natural place for the leaked state cleanup. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 19 ------------------- arch/x86/kernel/process_64.c | 5 ++--- arch/x86/kernel/traps.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 86974c72d0d0..01b115d86770 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -211,15 +211,6 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ -/* We need a safe address that is cheap to find and that is already - in L1 during context switch. The best choices are unfortunately - different for UP and SMP */ -#ifdef CONFIG_SMP -#define safe_address (__per_cpu_offset[0]) -#else -#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) -#endif - /* * These must be called with preempt disabled */ @@ -243,16 +234,6 @@ static inline void fpu_save_init(struct fpu *fpu) if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) asm volatile("fnclex"); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); } static inline void __save_init_fpu(struct task_struct *tsk) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 992b4e542bc3..753e803f7197 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -387,6 +387,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + __unlazy_fpu(prev_p); + /* * Reload esp0, LDT and the page table pointer: */ @@ -415,9 +417,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); - /* Must be after DS reload */ - __unlazy_fpu(prev_p); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5afe824c66e5..4d42300dcd2c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -585,6 +585,10 @@ void math_state_restore(void) struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; + /* We need a safe address that is cheap to find and that is already + in L1. We just brought in "thread->task", so use that */ +#define safe_address (thread->task) + if (!tsk_used_math(tsk)) { local_irq_enable(); /* @@ -602,6 +606,16 @@ void math_state_restore(void) __thread_fpu_begin(thread); + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. safe_address is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); + /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ -- cgit v1.2.3 From 34ddc81a230b15c0e345b6b253049db731499f7e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 18 Feb 2012 12:56:35 -0800 Subject: i387: re-introduce FPU state preloading at context switch time After all the FPU state cleanups and finally finding the problem that caused all our FPU save/restore problems, this re-introduces the preloading of FPU state that was removed in commit b3b0870ef3ff ("i387: do not preload FPU state at task switch time"). However, instead of simply reverting the removal, this reimplements preloading with several fixes, most notably - properly abstracted as a true FPU state switch, rather than as open-coded save and restore with various hacks. In particular, implementing it as a proper FPU state switch allows us to optimize the CR0.TS flag accesses: there is no reason to set the TS bit only to then almost immediately clear it again. CR0 accesses are quite slow and expensive, don't flip the bit back and forth for no good reason. - Make sure that the same model works for both x86-32 and x86-64, so that there are no gratuitous differences between the two due to the way they save and restore segment state differently due to architectural differences that really don't matter to the FPU state. - Avoid exposing the "preload" state to the context switch routines, and in particular allow the concept of lazy state restore: if nothing else has used the FPU in the meantime, and the process is still on the same CPU, we can avoid restoring state from memory entirely, just re-expose the state that is still in the FPU unit. That optimized lazy restore isn't actually implemented here, but the infrastructure is set up for it. Of course, older CPU's that use 'fnsave' to save the state cannot take advantage of this, since the state saving also trashes the state. In other words, there is now an actual _design_ to the FPU state saving, rather than just random historical baggage. Hopefully it's easier to follow as a result. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 110 ++++++++++++++++++++++++++++++++++++------- arch/x86/kernel/process_32.c | 5 +- arch/x86/kernel/process_64.c | 5 +- arch/x86/kernel/traps.c | 55 +++++++++++++--------- 4 files changed, 133 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index f5376676f89c..a850b4d8d14d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); +extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ /* - * These must be called with preempt disabled + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. */ -static inline void fpu_save_init(struct fpu *fpu) +static inline int fpu_save_init(struct fpu *fpu) { if (use_xsave()) { fpu_xsave(fpu); @@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu) * xsave header may indicate the init state of the FP. */ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return; + return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); } else { asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); - return; + return 0; } - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); + return 0; + } + return 1; } -static inline void __save_init_fpu(struct task_struct *tsk) +static inline int __save_init_fpu(struct task_struct *tsk) { - fpu_save_init(&tsk->thread.fpu); + return fpu_save_init(&tsk->thread.fpu); } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) } /* - * Signal frame handlers... + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +#define fpu_lazy_restore(tsk) (0) +#define fpu_lazy_state_intact(tsk) do { } while (0) + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (__save_init_fpu(old)) + fpu_lazy_state_intact(old); + __thread_clear_has_fpu(old); + old->fpu_counter++; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + if (fpu.preload) { + if (fpu_lazy_restore(new)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} -static inline void __unlazy_fpu(struct task_struct *tsk) +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; + if (fpu.preload) + __math_state_restore(new); } +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + static inline void __clear_fpu(struct task_struct *tsk) { if (__thread_has_fpu(tsk)) { @@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk) static inline void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); - __unlazy_fpu(tsk); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; preempt_enable(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 324cd722b447..80bfe1ab0031 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -299,10 +299,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + fpu = switch_fpu_prepare(prev_p, next_p); /* * Reload esp0. @@ -357,6 +358,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); + switch_fpu_finish(next_p, fpu); + percpu_write(current_task, next_p); return prev_p; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 753e803f7197..1fd94bc4279d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,8 +386,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + fpu_switch_t fpu; - __unlazy_fpu(prev_p); + fpu = switch_fpu_prepare(prev_p, next_p); /* * Reload esp0, LDT and the page table pointer: @@ -457,6 +458,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; + switch_fpu_finish(next_p, fpu); + /* * Switch the PDA and FPU contexts. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ad25e51f40c4..77da5b475ad2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,6 +570,37 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } +/* + * This gets called with the process already owning the + * FPU state, and with CR0.TS cleared. It just needs to + * restore the FPU register state. + */ +void __math_state_restore(struct task_struct *tsk) +{ + /* We need a safe address that is cheap to find and that is already + in L1. We've just brought in "tsk->thread.has_fpu", so use that */ +#define safe_address (tsk->thread.has_fpu) + + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. safe_address is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (safe_address)); + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(tsk); + force_sig(SIGSEGV, tsk); + return; + } +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -584,10 +615,6 @@ void math_state_restore(void) { struct task_struct *tsk = current; - /* We need a safe address that is cheap to find and that is already - in L1. We're just bringing in "tsk->thread.has_fpu", so use that */ -#define safe_address (tsk->thread.has_fpu) - if (!tsk_used_math(tsk)) { local_irq_enable(); /* @@ -604,25 +631,7 @@ void math_state_restore(void) } __thread_fpu_begin(tsk); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - __thread_fpu_end(tsk); - force_sig(SIGSEGV, tsk); - return; - } + __math_state_restore(tsk); tsk->fpu_counter++; } -- cgit v1.2.3 From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 10:24:09 -0800 Subject: i387: fix up some fpu_counter confusion This makes sure we clear the FPU usage counter for newly created tasks, just so that we start off in a known state (for example, don't try to preload the FPU state on the first task switch etc). It also fixes a thinko in when we increment the fpu_counter at task switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU state preloading at context switch time"). We should increment the *new* task fpu_counter, not the old task, and only if we decide to use that state (whether lazily or preloaded). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 3 ++- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a850b4d8d14d..8df95849721d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (__save_init_fpu(old)) fpu_lazy_state_intact(old); __thread_clear_has_fpu(old); - old->fpu_counter++; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { + new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else @@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta } else { old->fpu_counter = 0; if (fpu.preload) { + new->fpu_counter++; if (fpu_lazy_restore(new)) fpu.preload = 0; else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 80bfe1ab0031..bc32761bc27a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1fd94bc4279d..8ad880b3bc1c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); -- cgit v1.2.3 From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 13:27:00 -0800 Subject: i387: support lazy restore of FPU state This makes us recognize when we try to restore FPU state that matches what we already have in the FPU on this CPU, and avoids the restore entirely if so. To do this, we add two new data fields: - a percpu 'fpu_owner_task' variable that gets written any time we update the "has_fpu" field, and thus acts as a kind of back-pointer to the task that owns the CPU. The exception is when we save the FPU state as part of a context switch - if the save can keep the FPU state around, we leave the 'fpu_owner_task' variable pointing at the task whose FP state still remains on the CPU. - a per-thread 'last_cpu' field, that indicates which CPU that thread used its FPU on last. We update this on every context switch (writing an invalid CPU number if the last context switch didn't leave the FPU in a lazily usable state), so we know that *that* thread has done nothing else with the FPU since. These two fields together can be used when next switching back to the task to see if the CPU still matches: if 'fpu_owner_task' matches the task we are switching to, we know that no other task (or kernel FPU usage) touched the FPU on this CPU in the meantime, and if the current CPU number matches the 'last_cpu' field, we know that this thread did no other FP work on any other CPU, so the FPU state on the CPU must match what was saved on last context switch. In that case, we can avoid the 'f[x]rstor' entirely, and just clear the CR0.TS bit. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 35 +++++++++++++++++++++++------------ arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/process_64.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "emms\n\t" /* clear stack tags */ "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.has_fpu)); + [addr] "m" (tsk->thread.fpu.has_fpu)); return fpu_restore_checking(&tsk->thread.fpu); } @@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) */ static inline int __thread_has_fpu(struct task_struct *tsk) { - return tsk->thread.has_fpu; + return tsk->thread.fpu.has_fpu; } /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 1; + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); } /* @@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; * We don't do that yet, so "fpu_lazy_restore()" always returns * false, but some day.. */ -#define fpu_lazy_restore(tsk) (0) -#define fpu_lazy_state_intact(tsk) do { } while (0) +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; if (__thread_has_fpu(old)) { - if (__save_init_fpu(old)) - fpu_lazy_state_intact(old); - __thread_clear_has_fpu(old); + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; @@ -454,7 +456,6 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ - unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..b667148dfad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bc32761bc27a..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ad880b3bc1c..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; fpu_switch_t fpu; - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0, LDT and the page table pointer: -- cgit v1.2.3