From 362fde0410377e468ca00ad363fdf3e3ec42eb6a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:30 -0700 Subject: clocksource: Simplify the logic around clocksource wrapping safety margins The clocksource logic has a number of places where we try to include a safety margin. Most of these are 12% safety margins, but they are inconsistently applied and sometimes are applied on top of each other. Additionally, in the previous patch, we corrected an issue where we unintentionally in effect created a 50% safety margin, which these 12.5% margins where then added to. So to simplify the logic here, this patch removes the various 12.5% margins, and consolidates adding the margin in one place: clocks_calc_max_nsecs(). Additionally, Linus prefers a 50% safety margin, as it allows bad clock values to be more easily caught. This should really have no net effect, due to the corrected issue earlier which caused greater then 50% margins to be used w/o issue. Signed-off-by: John Stultz Acked-by: Stephen Boyd (for the sched_clock.c bit) Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3b8ae45020c1 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -125,9 +125,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); - /* calculate how many ns until we wrap */ + /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); -- cgit v1.2.3 From fb82fe2fe8588745edd73aa3a6229facac5c1e15 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:31 -0700 Subject: clocksource: Add 'max_cycles' to 'struct clocksource' In order to facilitate clocksource validation, add a 'max_cycles' field to the clocksource structure which will hold the maximum cycle value that can safely be multiplied without potentially causing an overflow. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 5 +++-- kernel/time/clocksource.c | 28 ++++++++++++++++------------ kernel/time/sched_clock.c | 2 +- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9c78d15d33e4..16d048cadebb 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -56,6 +56,7 @@ struct module; * @shift: cycle to nanosecond divisor (power of two) * @max_idle_ns: max idle time permitted by the clocksource (nsecs) * @maxadj: maximum adjustment value to mult (~11%) + * @max_cycles: maximum safe cycle value which won't overflow on multiplication * @flags: flags describing special properties * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary @@ -76,7 +77,7 @@ struct clocksource { #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif - + u64 max_cycles; const char *name; struct list_head list; int rating; @@ -189,7 +190,7 @@ extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask); +clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles); extern void clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ace95763b3a6..fc2a9de43ca1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -469,11 +469,13 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) * * NOTE: This function includes a safety margin of 50%, so that bad clock values * can be detected. */ -u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; @@ -493,6 +495,10 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + /* Return 50% of the actual maximum, so we can detect bad values */ max_nsecs >>= 1; @@ -500,17 +506,15 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) } /** - * clocksource_max_deferment - Returns max time the clocksource should be deferred - * @cs: Pointer to clocksource + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static inline void clocksource_update_max_deferment(struct clocksource *cs) { - u64 max_nsecs; - - max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, - cs->mask); - return max_nsecs; + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -684,7 +688,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + clocksource_update_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -730,8 +734,8 @@ int clocksource_register(struct clocksource *cs) "Clocksource %s might overflow on 11%% adjustment\n", cs->name); - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); + /* Update max idle time permitted for this clocksource */ + clocksource_update_max_deferment(cs); mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3b8ae45020c1..ca3bc5c7027c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -126,7 +126,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); /* calculate how many nanosecs until we risk wrapping */ - wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ -- cgit v1.2.3 From 8710e914027e4f64058ebbf0501cc6db3cc8454f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:22 -0700 Subject: timers, sched/clock: Match scope of read and write seqcounts Currently the scope of the raw_write_seqcount_begin/end() in sched_clock_register() far exceeds the scope of the read section in sched_clock(). This gives the impression of safety during cursory review but achieves little. Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves. This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register. We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section. Signed-off-by: Daniel Thompson [ Reworked it slightly to apply to tip/timers/core] Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ca3bc5c7027c..1751e956add9 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq; - if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq)); - cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; } /* @@ -111,7 +109,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; @@ -124,10 +121,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); - new_wrap_kt = ns_to_ktime(wrap); + cd.wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +136,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch; -- cgit v1.2.3 From cf7c9c170787d6870af54684822f58acc00a966c Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:23 -0700 Subject: timers, sched/clock: Optimize cache line usage Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular: 1. cd is not ____cacheline_aligned, 2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath, 3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd). This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line. Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 112 +++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 35 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1751e956add9..872e0685d1fb 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,59 @@ #include #include -struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * @epoch_ns: sched_clock value at last update + * @epoch_cyc: Clock cycle value at last update + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks + * @read_sched_clock: Current clock source (or dummy source when suspended) + * @mult: Multipler for scaled math conversion + * @shift: Shift value for scaled math conversion + * @suspended: Flag to indicate if the clock is suspended (stopped) + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; }; +/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping + * @rate: Tick rate of the registered clock + * @actual_read_sched_clock: Registered clock read function + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +80,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +}; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +94,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data; do { seq = raw_read_seqcount_begin(&cd.seq); - res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq)); @@ -83,16 +118,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data; - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -111,6 +147,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data; if (cd.rate > rate) return; @@ -129,17 +166,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq); r = rate; @@ -171,7 +209,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -187,17 +225,21 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; } static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; } static struct syscore_ops sched_clock_ops = { -- cgit v1.2.3 From 13dbeb384d2d3aa555ea48d511e8cb110bd172e0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:24 -0700 Subject: timers, sched/clock: Remove suspend from clock_read_data() Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath. In fact, once it is removed, we can remove the conditional branches from sched_clock() and install a dummy read_sched_clock function to suspend the clock. The new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock itself. Suggested-by: Thomas Gleixner Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 872e0685d1fb..52ea5d976393 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -28,10 +28,9 @@ * @read_sched_clock: Current clock source (or dummy source when suspended) * @mult: Multipler for scaled math conversion * @shift: Shift value for scaled math conversion - * @suspended: Flag to indicate if the clock is suspended (stopped) * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. */ @@ -42,7 +41,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; }; /** @@ -64,6 +62,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; @@ -83,6 +82,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -99,12 +100,9 @@ unsigned long long notrace sched_clock(void) do { seq = raw_read_seqcount_begin(&cd.seq); - res = rd->epoch_ns; - if (!rd->suspended) { - cyc = rd->read_sched_clock(); - cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; - res += cyc_to_ns(cyc, rd->mult, rd->shift); - } + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq)); return res; @@ -120,7 +118,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data; - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -166,10 +164,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read; raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -209,7 +208,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -223,13 +222,24 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + return cd.read_data.epoch_cyc; +} + static int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; return 0; } @@ -237,9 +247,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data; - rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { -- cgit v1.2.3 From 9fee69a8c8070b38b558161a3f18bd5e2b664682 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:25 -0700 Subject: timers, sched/clock: Remove redundant notrace from update function Currently update_sched_clock() is marked as notrace but this function is not called by ftrace. This is trivially fixed by removing the mark up. Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-5-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 52ea5d976393..8adb9d0c969a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -111,7 +111,7 @@ unsigned long long notrace sched_clock(void) /* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { unsigned long flags; u64 cyc; -- cgit v1.2.3 From 1809bfa44e1019e397fabaa6f2349bb7237e57a4 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:26 -0700 Subject: timers, sched/clock: Avoid deadlock during read from NMI Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin(). This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC"). Suggested-by: Stephen Boyd Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 103 ++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 35 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8adb9d0c969a..eeea1e950b72 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -47,19 +47,20 @@ struct clock_read_data { * struct clock_data - all data needed for sched_clock (including * registration of a new clock source) * - * @seq: Sequence counter for protecting updates. + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. * @wrap_kt: Duration for which clock can run before wrapping * @rate: Tick rate of the registered clock * @actual_read_sched_clock: Registered clock read function * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -80,10 +81,9 @@ static u64 notrace jiffy_sched_clock_read(void) } static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -95,10 +95,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd; do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; @@ -108,27 +109,51 @@ unsigned long long notrace sched_clock(void) return res; } +/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + /* * Atomically update the sched_clock epoch. */ static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -145,7 +170,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; if (cd.rate > rate) return; @@ -162,22 +187,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); + rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read; - raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { @@ -227,15 +253,22 @@ void __init sched_clock_postinit(void) * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of epoch_cyc. */ static u64 notrace suspended_sched_clock_read(void) { - return cd.read_data.epoch_cyc; + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; } static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -245,7 +278,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); -- cgit v1.2.3 From 32fea568aec5b73ae27253125522b5c2a970a1f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Mar 2015 07:08:06 +0100 Subject: timers, sched/clock: Clean up the code a bit Trivial cleanups, to improve the readability of the generic sched_clock() code: - Improve and standardize comments - Standardize the coding style - Use vertical spacing where appropriate - etc. No code changed: md5: 19a053b31e0c54feaeff1492012b019a sched_clock.o.before.asm 19a053b31e0c54feaeff1492012b019a sched_clock.o.after.asm Cc: Catalin Marinas Cc: Daniel Thompson Cc: John Stultz Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Russell King Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 107 ++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 51 deletions(-) (limited to 'kernel/time/sched_clock.c') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index eeea1e950b72..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,5 +1,6 @@ /* - * sched_clock.c: support for extending counters to full 64-bit ns counter + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,15 +20,15 @@ #include /** - * struct clock_read_data - data required to read from sched_clock + * struct clock_read_data - data required to read from sched_clock() * - * @epoch_ns: sched_clock value at last update - * @epoch_cyc: Clock cycle value at last update + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks - * @read_sched_clock: Current clock source (or dummy source when suspended) - * @mult: Multipler for scaled math conversion - * @shift: Shift value for scaled math conversion + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by * some very hot code paths. It occupies <=40 bytes and, when combined @@ -44,25 +45,26 @@ struct clock_read_data { }; /** - * struct clock_data - all data needed for sched_clock (including + * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * * @seq: Sequence counter for protecting updates. The lowest * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. - * @wrap_kt: Duration for which clock can run before wrapping - * @rate: Tick rate of the registered clock - * @actual_read_sched_clock: Registered clock read function + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data[0] (combined) should fit - * into a single 64 byte cache line. + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; - struct clock_read_data read_data[2]; - ktime_t wrap_kt; - unsigned long rate; + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; @@ -112,10 +114,10 @@ unsigned long long notrace sched_clock(void) /* * Updating the data required to read the clock. * - * sched_clock will never observe mis-matched data even if called from + * sched_clock() will never observe mis-matched data even if called from * an NMI. We do this by maintaining an odd/even copy of the data and - * steering sched_clock to one or the other using a sequence counter. - * In order to preserve the data cache profile of sched_clock as much + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much * as possible the system reverts back to the even copy when the update * completes; the odd copy is used *only* during an update. */ @@ -135,7 +137,7 @@ static void update_clock_read_data(struct clock_read_data *rd) } /* - * Atomically update the sched_clock epoch. + * Atomically update the sched_clock() epoch. */ static void update_sched_clock(void) { @@ -146,9 +148,7 @@ static void update_sched_clock(void) rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); rd.epoch_ns = ns; rd.epoch_cyc = cyc; @@ -160,11 +160,12 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init sched_clock_register(u64 (*read)(void), int bits, - unsigned long rate) +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -177,51 +178,53 @@ void __init sched_clock_register(u64 (*read)(void), int bits, WARN_ON(!irqs_disabled()); - /* calculate the mult/shift to convert counter ticks to ns. */ + /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; - /* calculate how many nanosecs until we risk wrapping */ + /* Calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); rd = cd.read_data[0]; - /* update epoch for new counter and update epoch_ns from old counter*/ + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); cd.actual_read_sched_clock = read; - rd.read_sched_clock = read; - rd.sched_clock_mask = new_mask; - rd.mult = new_mult; - rd.shift = new_shift; - rd.epoch_cyc = new_epoch; - rd.epoch_ns = ns; + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { r /= 1000000; r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate the ns resolution of this counter */ + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - /* Enable IRQ time accounting if we have a fast enough sched_clock */ + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); @@ -231,7 +234,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, void __init sched_clock_postinit(void) { /* - * If no sched_clock function has been provided at that point, + * If no sched_clock() function has been provided at that point, * make it the final one one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) @@ -257,7 +260,7 @@ void __init sched_clock_postinit(void) * This function must only be called from the critical * section in sched_clock(). It relies on the read_seqcount_retry() * at the end of the critical section to be sure we observe the - * correct copy of epoch_cyc. + * correct copy of 'epoch_cyc'. */ static u64 notrace suspended_sched_clock_read(void) { @@ -273,6 +276,7 @@ static int sched_clock_suspend(void) update_sched_clock(); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; + return 0; } @@ -286,13 +290,14 @@ static void sched_clock_resume(void) } static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); + return 0; } device_initcall(sched_clock_syscore_init); -- cgit v1.2.3