From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:58 +0000 Subject: tick: Nohz: Rework next timer evaluation The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nano seconds based. We have also to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That identifies the case where no timer is pending clearly with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..fbb80e0030bf 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base and does the comparison against the given - * jiffie. - */ -extern unsigned long get_next_timer_interrupt(unsigned long now); - /* * Timer-statistics info: */ -- cgit v1.2.3 From 1dabbcec2c0a36fe43509d06499b9e512e70a028 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:28 +0000 Subject: timer: Use hlist for the timer wheel hash buckets This reduces the size of struct tvec_base by 50% and results in slightly smaller code as well. Before: struct tvec_base: size: 8256, cachelines: 129 text data bss dec hex filename 17698 13297 8256 39251 9953 ../build/kernel/time/timer.o After: struct tvec_base: 4160, cachelines: 65 text data bss dec hex filename 17491 9201 4160 30852 7884 ../build/kernel/time/timer.o Signed-off-by: Thomas Gleixner Reviewed-by: Viresh Kumar Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224511.854731214@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 6 ++--- kernel/time/timer.c | 64 ++++++++++++++++++++++----------------------------- 2 files changed, 30 insertions(+), 40 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index fbb80e0030bf..064ee24d3f38 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -14,7 +14,7 @@ struct timer_list { * All fields that change during normal runtime grouped to the * same cacheline */ - struct list_head entry; + struct hlist_node entry; unsigned long expires; struct tvec_base *base; @@ -71,7 +71,7 @@ extern struct tvec_base boot_tvec_bases; #define TIMER_FLAG_MASK 0x3LU #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ - .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ @@ -168,7 +168,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.next != NULL; + return timer->entry.pprev != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e212df24ad3f..3a5e0c840884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -70,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64); #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { - struct list_head vec[TVN_SIZE]; + struct hlist_head vec[TVN_SIZE]; }; struct tvec_root { - struct list_head vec[TVR_SIZE]; + struct hlist_head vec[TVR_SIZE]; }; struct tvec_base { @@ -356,7 +356,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; + struct hlist_head *vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; @@ -390,7 +390,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv5.vec + i; } - list_add(&timer->entry, vec); + hlist_add_head(&timer->entry, vec); } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) @@ -504,8 +504,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); return 0; @@ -551,7 +551,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.next == TIMER_ENTRY_STATIC) { /* * This is not really a fixup. The timer was * statically initialized. We just make sure that it @@ -655,7 +655,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { struct tvec_base *base = raw_cpu_read(tvec_bases); - timer->entry.next = NULL; + timer->entry.pprev = NULL; timer->base = (void *)((unsigned long)base | flags); timer->slack = -1; #ifdef CONFIG_TIMER_STATS @@ -687,14 +687,14 @@ EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { - struct list_head *entry = &timer->entry; + struct hlist_node *entry = &timer->entry; debug_deactivate(timer); - __list_del(entry->prev, entry->next); + __hlist_del(entry); if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; + entry->pprev = NULL; + entry->next = LIST_POISON2; } static inline void @@ -1095,16 +1095,17 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(struct tvec_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; + struct timer_list *timer; + struct hlist_node *tmp; + struct hlist_head tv_list; - list_replace_init(tv->vec + index, &tv_list); + hlist_move_list(tv->vec + index, &tv_list); /* * We are removing _all_ timers from the list, so we * don't have to detach them individually. */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { BUG_ON(tbase_get_base(timer->base) != base); /* No accounting, while moving them */ __internal_add_timer(base, timer); @@ -1172,8 +1173,8 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; + struct hlist_head work_list; + struct hlist_head *head = &work_list; int index; if (!base->all_timers) { @@ -1192,13 +1193,13 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, head); - while (!list_empty(head)) { + hlist_move_list(base->tv1.vec + index, head); + while (!hlist_empty(head)) { void (*fn)(unsigned long); unsigned long data; bool irqsafe; - timer = list_first_entry(head, struct timer_list,entry); + timer = hlist_entry(head->first, struct timer_list, entry); fn = timer->function; data = timer->data; irqsafe = tbase_get_irqsafe(timer->base); @@ -1240,7 +1241,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) /* Look for timer events in tv1. */ index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { + hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { if (tbase_get_deferrable(nte->base)) continue; @@ -1271,7 +1272,7 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { - list_for_each_entry(nte, varp->vec + slot, entry) { + hlist_for_each_entry(nte, varp->vec + slot, entry) { if (tbase_get_deferrable(nte->base)) continue; @@ -1530,12 +1531,12 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { struct timer_list *timer; - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer_set_base(timer, new_base); @@ -1603,23 +1604,12 @@ static inline void timer_register_cpu_notifier(void) { } static void __init init_timer_cpu(struct tvec_base *base, int cpu) { - int j; - BUG_ON(base != tbase_get_base(base)); base->cpu = cpu; per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; } -- cgit v1.2.3 From 0eeda71bc30d74f66f8231f45621d5ace3419186 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:29 +0000 Subject: timer: Replace timer base by a cpu index Instead of storing a pointer to the per cpu tvec_base we can simply cache a CPU index in the timer_list and use that to get hold of the correct per cpu tvec_base. This is only used in lock_timer_base() and the slightly larger code is peanuts versus the spinlock operation and the d-cache foot print of the timer wheel. Aside of that this allows to get rid of following nuisances: - boot_tvec_base That statically allocated 4k bss data is just kept around so the timer has a home when it gets statically initialized. It serves no other purpose. With the CPU index we assign the timer to CPU0 at static initialization time and therefor can avoid the whole boot_tvec_base dance. That also simplifies the init code, which just can use the per cpu base. Before: text data bss dec hex filename 17491 9201 4160 30852 7884 ../build/kernel/time/timer.o After: text data bss dec hex filename 17440 9193 0 26633 6809 ../build/kernel/time/timer.o - Overloading the base pointer with various flags The CPU index has enough space to hold the flags (deferrable, irqsafe) so we can get rid of the extra masking and bit fiddling with the base pointer. As a benefit we reduce the size of struct timer_list on 64 bit machines. 4 - 8 bytes, a size reduction up to 15% per struct timer_list, which is a real win as we have tons of them embedded in other structs. This changes also the newly added deferrable printout of the timer start trace point to capture and print all timer->flags, which allows us to decode the target cpu of the timer as well. We might have used bitfields for this, but that would change the static initializers and the init function for no value to accomodate big endian bitfields. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Cc: Steven Rostedt Cc: Badhri Jagan Sridharan Link: http://lkml.kernel.org/r/20150526224511.950084301@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 38 ++++++------- include/trace/events/timer.h | 13 ++--- kernel/time/timer.c | 127 ++++++++++++------------------------------- 3 files changed, 58 insertions(+), 120 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 064ee24d3f38..4a0d52bc2073 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -14,27 +14,23 @@ struct timer_list { * All fields that change during normal runtime grouped to the * same cacheline */ - struct hlist_node entry; - unsigned long expires; - struct tvec_base *base; - - void (*function)(unsigned long); - unsigned long data; - - int slack; + struct hlist_node entry; + unsigned long expires; + void (*function)(unsigned long); + unsigned long data; + u32 flags; + int slack; #ifdef CONFIG_TIMER_STATS - int start_pid; - void *start_site; - char start_comm[16]; + int start_pid; + void *start_site; + char start_comm[16]; #endif #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; -extern struct tvec_base boot_tvec_bases; - #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key @@ -49,9 +45,6 @@ extern struct tvec_base boot_tvec_bases; #endif /* - * Note that all tvec_bases are at least 4 byte aligned and lower two bits - * of base in timer_list is guaranteed to be zero. Use them for flags. - * * A deferrable timer will work normally when the system is busy, but * will not cause a CPU to come out of idle just to service it; instead, * the timer will be serviced when the CPU eventually wakes up with a @@ -65,17 +58,18 @@ extern struct tvec_base boot_tvec_bases; * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! */ -#define TIMER_DEFERRABLE 0x1LU -#define TIMER_IRQSAFE 0x2LU - -#define TIMER_FLAG_MASK 0x3LU +#define TIMER_CPUMASK 0x0007FFFF +#define TIMER_MIGRATING 0x00080000 +#define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) +#define TIMER_DEFERRABLE 0x00100000 +#define TIMER_IRQSAFE 0x00200000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \ + .flags = (_flags), \ .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index d7abef1fe6e0..073b9ac245ba 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -45,16 +45,16 @@ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long expires, - unsigned int deferrable), + unsigned int flags), - TP_ARGS(timer, expires, deferrable), + TP_ARGS(timer, expires, flags), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, now ) - __field( unsigned int, deferrable ) + __field( unsigned int, flags ) ), TP_fast_assign( @@ -62,13 +62,12 @@ TRACE_EVENT(timer_start, __entry->function = timer->function; __entry->expires = expires; __entry->now = jiffies; - __entry->deferrable = deferrable; + __entry->flags = flags; ), - TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c", + TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x", __entry->timer, __entry->function, __entry->expires, - (long)__entry->expires - __entry->now, - __entry->deferrable > 0 ? 'y':'n') + (long)__entry->expires - __entry->now, __entry->flags) ); /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3a5e0c840884..1540af9f62eb 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -92,43 +92,8 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; -/* - * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've - * made NULL special, hint: lock_timer_base()) and we cannot get a compile time - * pointer to per-cpu entries because we don't know where we'll map the section, - * even for the boot cpu. - * - * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the - * rest of them. - */ -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); - -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; - -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); -} - -static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) -{ - return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); -} - -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) -{ - return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); -} - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) -{ - unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; - timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); -} +static DEFINE_PER_CPU(struct tvec_base, tvec_bases); static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -403,7 +368,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) /* * Update base->active_timers and base->next_timer */ - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { if (!base->active_timers++ || time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; @@ -422,7 +387,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * require special care against races with idle_cpu(), lets deal * with that later. */ - if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu)) + if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); } @@ -443,7 +408,7 @@ static void timer_stats_account_timer(struct timer_list *timer) if (likely(!timer->start_site)) return; - if (unlikely(tbase_get_deferrable(timer->base))) + if (unlikely(timer->flags & TIMER_DEFERRABLE)) flag |= TIMER_STATS_FLAG_DEFERRABLE; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, @@ -636,7 +601,7 @@ static inline void debug_activate(struct timer_list *timer, unsigned long expires) { debug_timer_activate(timer); - trace_timer_start(timer, expires, tbase_get_deferrable(timer->base)); + trace_timer_start(timer, expires, timer->flags); } static inline void debug_deactivate(struct timer_list *timer) @@ -653,10 +618,8 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = raw_cpu_read(tvec_bases); - timer->entry.pprev = NULL; - timer->base = (void *)((unsigned long)base | flags); + timer->flags = flags | raw_smp_processor_id(); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -701,7 +664,7 @@ static inline void detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); - if (!tbase_get_deferrable(timer->base)) + if (!(timer->flags & TIMER_DEFERRABLE)) base->active_timers--; base->all_timers--; } @@ -713,7 +676,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; @@ -732,24 +695,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * So __run_timers/migrate_timers can safely modify all timers which could * be found on ->tvX lists. * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * When the timer's base is locked and removed from the list, the + * TIMER_MIGRATING flag is set, FIXME */ static struct tvec_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { - struct tvec_base *base; - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { + u32 tf = timer->flags; + struct tvec_base *base; + + if (!(tf & TIMER_MIGRATING)) { + base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) + if (timer->flags == tf) return base; - /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); @@ -776,7 +737,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); + new_base = per_cpu_ptr(&tvec_bases, cpu); if (base != new_base) { /* @@ -788,11 +749,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); + timer->flags |= TIMER_MIGRATING; + spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer_set_base(timer, base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; } } @@ -954,13 +916,13 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1025,8 +987,6 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); - /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1081,7 +1041,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. */ - WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1106,7 +1066,6 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) * don't have to detach them individually. */ hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); /* No accounting, while moving them */ __internal_add_timer(base, timer); } @@ -1202,7 +1161,7 @@ static inline void __run_timers(struct tvec_base *base) timer = hlist_entry(head->first, struct timer_list, entry); fn = timer->function; data = timer->data; - irqsafe = tbase_get_irqsafe(timer->base); + irqsafe = timer->flags & TIMER_IRQSAFE; timer_stats_account_timer(timer); @@ -1242,7 +1201,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) index = slot = timer_jiffies & TVR_MASK; do { hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1273,7 +1232,7 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { hlist_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1343,7 +1302,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = __this_cpu_read(tvec_bases); + struct tvec_base *base = this_cpu_ptr(&tvec_bases); u64 expires = KTIME_MAX; unsigned long nextevt; @@ -1395,7 +1354,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __this_cpu_read(tvec_bases); + struct tvec_base *base = this_cpu_ptr(&tvec_bases); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1534,12 +1493,13 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible); static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { struct timer_list *timer; + int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); - timer_set_base(timer, new_base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } @@ -1551,8 +1511,8 @@ static void migrate_timers(int cpu) int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + old_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = this_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1576,7 +1536,6 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1602,12 +1561,11 @@ static inline void timer_register_cpu_notifier(void) static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ -static void __init init_timer_cpu(struct tvec_base *base, int cpu) +static void __init init_timer_cpu(int cpu) { - BUG_ON(base != tbase_get_base(base)); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); base->cpu = cpu; - per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); base->timer_jiffies = jiffies; @@ -1616,27 +1574,14 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu) static void __init init_timer_cpus(void) { - struct tvec_base *base; - int local_cpu = smp_processor_id(); int cpu; - for_each_possible_cpu(cpu) { - if (cpu == local_cpu) - base = &boot_tvec_bases; -#ifdef CONFIG_SMP - else - base = per_cpu_ptr(&__tvec_bases, cpu); -#endif - - init_timer_cpu(base, cpu); - } + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); } void __init init_timers(void) { - /* ensure there are enough low bits for flags in timer->base pointer */ - BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - init_timer_cpus(); init_timer_stats(); timer_register_cpu_notifier(); -- cgit v1.2.3 From c74441a17eb975b604e339ca6c11b9ab9aaca11f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:31 +0000 Subject: timer: Stats: Simplify the flags handling Simplify the handling of the flag storage for the timer statistics. No intermediate storage anymore. Just hand over the flags field. I left the printout of 'deferrable' for now because changing this would be an ABI update and I have no idea how strong people feel about that. OTOH, I wonder whether we should kill the whole timer stats stuff because all of that information can be retrieved via ftrace/perf as well. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.046626248@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 5 +---- kernel/time/timer.c | 7 ++----- kernel/time/timer_stats.c | 10 +++++----- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/timer.h b/include/linux/timer.h index 4a0d52bc2073..ff0689b6e297 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -188,13 +188,10 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); extern int timer_stats_active; -#define TIMER_STATS_FLAG_DEFERRABLE 0x1 - extern void init_timer_stats(void); extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag); + void *timerf, char *comm, u32 flags); extern void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1540af9f62eb..3398d93c74a7 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -404,15 +404,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - unsigned int flag = 0; - if (likely(!timer->start_site)) return; - if (unlikely(timer->flags & TIMER_DEFERRABLE)) - flag |= TIMER_STATS_FLAG_DEFERRABLE; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); + timer->function, timer->start_comm, + timer->flags); } #else diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f21302e..1adecb4b87c8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -68,7 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; - unsigned int timer_flag; + u32 flags; /* * We save the command-line string to preserve @@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * @startf: pointer to the function which did the timer setup * @timerf: pointer to the timer callback function of the timer * @comm: name of the process which set up the timer + * @tflags: The flags field of the timer * * When the timer is already registered, then the event counter is * incremented. Otherwise the timer is registered in a free slot. */ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) + void *timerf, char *comm, u32 tflags) { /* * It doesn't matter which lock we take: @@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; - input.timer_flag = timer_flag; + input.flags = tflags; raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) @@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->flags & TIMER_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { -- cgit v1.2.3 From bc7a34b8b9ebfb0f4b8a35a72a0b134fd6c5ef50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 26 May 2015 22:50:33 +0000 Subject: timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney Cc: Frederic Weisbecker Cc: Viresh Kumar Cc: John Stultz Cc: Joonwoo Park Cc: Wenbo Wang Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 ++ include/linux/sched.h | 6 +---- include/linux/sched/sysctl.h | 12 --------- include/linux/timer.h | 9 +++++++ kernel/rcu/tree_plugin.h | 2 -- kernel/sched/core.c | 9 +++---- kernel/sysctl.c | 18 +++++++------- kernel/time/hrtimer.c | 35 ++++++++++++++++++++------ kernel/time/tick-internal.h | 14 +++++++++++ kernel/time/tick-sched.c | 25 ++++++++++--------- kernel/time/timer.c | 59 ++++++++++++++++++++++++++++++++++++++++---- kernel/time/timer_list.c | 2 -- 12 files changed, 133 insertions(+), 60 deletions(-) (limited to 'include/linux/timer.h') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5db055821ef3..69551020bb97 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,6 +163,7 @@ enum hrtimer_base_type { * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events + * @migration_enabled: The migration of hrtimers to other cpus is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @next_timer: Pointer to the first expiring timer @@ -186,6 +187,7 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; + bool migration_enabled; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int in_hrtirq : 1, hres_active : 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..d7151460b0cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(int pinned); +extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } -static inline int get_nohz_timer_target(int pinned) -{ - return smp_processor_id(); -} #endif /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 596a0e007c62..c9e4731cf10b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif /* * control realtime throttling: diff --git a/include/linux/timer.h b/include/linux/timer.h index ff0689b6e297..61aa61dc410c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -238,6 +238,15 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include + +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0ef80a0bbabb..d72fa24f2312 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1432,8 +1432,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU, but * only if it has been awhile since the last time we did so. Afterwards, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ecb7c4216350..e9f25ce70c77 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -572,13 +572,12 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu = smp_processor_id(); - int i; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + if (!idle_cpu(cpu)) return cpu; rcu_read_lock(); @@ -7050,8 +7049,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a88fb9..b13e9d2de302 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f026413de4d6..6115f4df119b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. */ @@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -212,17 +229,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index ec2208aabdd1..2edde84744df 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -149,4 +149,18 @@ extern void tick_nohz_init(void); static inline void tick_nohz_init(void) { } #endif +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(void); +#else +static inline void timers_update_migration(void) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 812f7a3b9898..b1cb01699355 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -399,7 +399,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? */ static int tick_nohz_enabled __read_mostly = 1; -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -956,6 +956,16 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(); +} + /** * tick_nohz_switch_to_nohz - switch to nohz mode */ @@ -970,9 +980,6 @@ static void tick_nohz_switch_to_nohz(void) if (tick_switch_to_oneshot(tick_nohz_handler)) return; - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; - /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. @@ -984,6 +991,7 @@ static void tick_nohz_switch_to_nohz(void) hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); tick_program_event(next, 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } /* @@ -1035,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1117,13 +1126,7 @@ void tick_setup_sched_timer(void) hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 3398d93c74a7..343142ed996a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -85,6 +85,7 @@ struct tvec_base { unsigned long active_timers; unsigned long all_timers; int cpu; + bool migration_enabled; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -95,6 +96,54 @@ struct tvec_base { static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; + +void timers_update_migration(void) +{ + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ + if (this_cpu_read(tvec_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { + per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + } +} + +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&mutex); + return ret; +} + +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&tvec_bases); + return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); +} +#else +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) +{ + return this_cpu_ptr(&tvec_bases); +} +#endif + static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { @@ -716,11 +765,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -733,8 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* @@ -751,7 +799,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + timer->flags &= ~TIMER_BASEMASK; + timer->flags |= base->cpu; } } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1327004429be..a4536e1e3e2a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -29,8 +29,6 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): -- cgit v1.2.3