Diffstat (limited to 'kernel/time')
-rw-r--r--  kernel/time/posix-clock.c          |   3
-rw-r--r--  kernel/time/sched_clock.c          |  34
-rw-r--r--  kernel/time/tick-sched.c           |   6
-rw-r--r--  kernel/time/timekeeping.c          | 117
-rw-r--r--  kernel/time/timekeeping_debug.c    |  13
-rw-r--r--  kernel/time/timekeeping_internal.h |  15
6 files changed, 173 insertions, 15 deletions
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c2f3d0c490d5..1af0bb2cc45c 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
 	struct posix_clock_desc cd;
 	int err;
 
+	if (!timespec64_valid_strict(ts))
+		return -EINVAL;
+
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68d6c1190ac7..fcca4e72f1ef 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -71,16 +71,16 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
 {
-	*seq = raw_read_seqcount_latch(&cd.seq);
+	*seq = read_seqcount_latch(&cd.seq);
 	return cd.read_data + (*seq & 1);
 }
 
 notrace int sched_clock_read_retry(unsigned int seq)
 {
-	return raw_read_seqcount_latch_retry(&cd.seq, seq);
+	return read_seqcount_latch_retry(&cd.seq, seq);
 }
 
-unsigned long long noinstr sched_clock_noinstr(void)
+static __always_inline unsigned long long __sched_clock(void)
 {
 	struct clock_read_data *rd;
 	unsigned int seq;
@@ -98,11 +98,23 @@ unsigned long long noinstr sched_clock_noinstr(void)
 	return res;
 }
 
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+	return __sched_clock();
+}
+
 unsigned long long notrace sched_clock(void)
 {
 	unsigned long long ns;
 	preempt_disable_notrace();
-	ns = sched_clock_noinstr();
+	/*
+	 * All of __sched_clock() is a seqcount_latch reader critical section,
+	 * but relies on the raw helpers which are uninstrumented. For KCSAN,
+	 * mark all accesses in __sched_clock() as atomic.
+	 */
+	kcsan_nestable_atomic_begin();
+	ns = __sched_clock();
+	kcsan_nestable_atomic_end();
 	preempt_enable_notrace();
 	return ns;
 }
@@ -119,17 +131,19 @@ unsigned long long notrace sched_clock(void)
  */
 static void update_clock_read_data(struct clock_read_data *rd)
 {
-	/* update the backup (odd) copy with the new data */
-	cd.read_data[1] = *rd;
-
 	/* steer readers towards the odd copy */
-	raw_write_seqcount_latch(&cd.seq);
+	write_seqcount_latch_begin(&cd.seq);
 
 	/* now its safe for us to update the normal (even) copy */
 	cd.read_data[0] = *rd;
 
 	/* switch readers back to the even copy */
-	raw_write_seqcount_latch(&cd.seq);
+	write_seqcount_latch(&cd.seq);
+
+	/* update the backup (odd) copy with the new data */
+	cd.read_data[1] = *rd;
+
+	write_seqcount_latch_end(&cd.seq);
 }
 
 /*
@@ -267,7 +281,7 @@ void __init generic_sched_clock_init(void)
  */
 static u64 notrace suspended_sched_clock_read(void)
 {
-	unsigned int seq = raw_read_seqcount_latch(&cd.seq);
+	unsigned int seq = read_seqcount_latch(&cd.seq);
 
 	return cd.read_data[seq & 1].epoch_cyc;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..f203f000da1a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 	 *   smp_mb__after_spin_lock()
	 *   tick_nohz_task_switch()
	 *     LOAD p->tick_dep_mask
+	 *
+	 * XXX given a task picks up the dependency on schedule(), should we
+	 * only care about tasks that are currently on the CPU instead of all
+	 * that are on the runqueue?
+	 *
+	 * That is, does this want to be: task_on_cpu() / task_curr()?
	 */
	if (!sched_task_on_rq(tsk))
		return;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..cdd61990cd14 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
 	.base[1] = FAST_TK_INIT,
 };
 
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
 	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -411,7 +428,7 @@ static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
  *
- * Employ the latch technique; see @raw_write_seqcount_latch.
+ * Employ the latch technique; see @write_seqcount_latch.
  *
  * So if a NMI hits the update of base[0] then it will use base[1]
  * which is still consistent. In the worst case this can result is a
@@ -424,16 +441,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
 	struct tk_read_base *base = tkf->base;
 
 	/* Force readers off to base[1] */
-	raw_write_seqcount_latch(&tkf->seq);
+	write_seqcount_latch_begin(&tkf->seq);
 
 	/* Update base[0] */
 	memcpy(base, tkr, sizeof(*base));
 
 	/* Force readers back to base[0] */
-	raw_write_seqcount_latch(&tkf->seq);
+	write_seqcount_latch(&tkf->seq);
 
 	/* Update base[1] */
 	memcpy(base + 1, base, sizeof(*base));
+
+	write_seqcount_latch_end(&tkf->seq);
 }
 
 static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -443,11 +462,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	u64 now;
 
 	do {
-		seq = raw_read_seqcount_latch(&tkf->seq);
+		seq = read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
 		now += __timekeeping_get_ns(tkr);
-	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+	} while (read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
 }
@@ -2394,6 +2413,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
 
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts:		timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	u64 floor = atomic64_read(&mg_floor);
+	ktime_t f_real, offset, coarse;
+	unsigned int seq;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		*ts = tk_xtime(tk);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	coarse = timespec64_to_ktime(*ts);
+	f_real = ktime_add(floor, offset);
+	if (ktime_after(f_real, coarse))
+		*ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts:		pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	ktime_t old = atomic64_read(&mg_floor);
+	ktime_t offset, mono;
+	unsigned int seq;
+	u64 nsecs;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		ts->tv_sec = tk->xtime_sec;
+		mono = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		offset = tk_core.timekeeper.offs_real;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	mono = ktime_add_ns(mono, nsecs);
+
+	/*
+	 * Attempt to update the floor with the new time value. As any
+	 * update must be later then the existing floor, and would effect
+	 * a change to ctime from the perspective of the current task,
+	 * accept the resulting floor value regardless of the outcome of
+	 * the swap.
+	 */
+	if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+		ts->tv_nsec = 0;
+		timespec64_add_ns(ts, nsecs);
+		timekeeping_inc_mg_floor_swaps();
+	} else {
+		/*
+		 * Another task changed mg_floor since "old" was fetched.
+		 * "old" has been updated with the latest value of "mg_floor".
+		 * That value is newer than the previous floor value, which
+		 * is enough to effect a change to ctime. Accept it.
+		 */
+		*ts = ktime_to_timespec64(ktime_add(old, offset));
+	}
+}
+
 void ktime_get_coarse_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
 
 #define NUM_BINS 32
 
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
 static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
 			(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
 }
 
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+	return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..0bbae825bc02 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
  * timekeeping debug functions
  */
 #ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+	this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
 extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
 #else
+
 #define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
 #endif
 
 #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
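
The mg_floor logic added to timekeeping.c above can be illustrated outside the kernel. Below is a minimal userspace sketch of the same idea, assuming C11 <stdatomic.h> and POSIX clock_gettime(); the names mg_floor_ns, coarse_ts_mg() and fine_ts_mg() are invented for the example, and it deliberately stores realtime nanoseconds in the floor instead of a monotonic value converted via offs_real as the kernel does. Coarse readers return whichever is later of the coarse clock and the floor; fine-grained readers try to advance the floor with a single compare-and-swap and, on failure, simply hand back the newer floor rather than retrying, mirroring ktime_get_real_ts64_mg().

/*
 * Userspace sketch of the mg_floor pattern from the timekeeping.c hunks above.
 * NOT kernel code: the function names and the use of CLOCK_REALTIME_COARSE /
 * CLOCK_REALTIME are illustrative assumptions, and the monotonic-to-realtime
 * offset handling of the real implementation is intentionally omitted.
 */
#define _GNU_SOURCE
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Latest fine-grained timestamp handed out, in nanoseconds since the epoch. */
static _Atomic uint64_t mg_floor_ns;

static uint64_t ts_to_ns(const struct timespec *ts)
{
	return (uint64_t)ts->tv_sec * 1000000000ULL + (uint64_t)ts->tv_nsec;
}

/* Coarse-grained read, clamped so it is never earlier than the floor. */
static uint64_t coarse_ts_mg(void)
{
	struct timespec ts;
	uint64_t coarse, floor;

	clock_gettime(CLOCK_REALTIME_COARSE, &ts);
	coarse = ts_to_ns(&ts);
	floor = atomic_load(&mg_floor_ns);

	return floor > coarse ? floor : coarse;
}

/* Fine-grained read that also tries to push the floor forward. */
static uint64_t fine_ts_mg(void)
{
	struct timespec ts;
	uint64_t now, old;

	clock_gettime(CLOCK_REALTIME, &ts);
	now = ts_to_ns(&ts);
	old = atomic_load(&mg_floor_ns);

	/*
	 * Single compare-and-swap, no retry loop: on failure "old" has been
	 * updated to the newer floor installed by another thread, and that
	 * newer value is returned instead.
	 */
	if (atomic_compare_exchange_strong(&mg_floor_ns, &old, now))
		return now;
	return old;
}

int main(void)
{
	uint64_t coarse_before = coarse_ts_mg();
	uint64_t fine = fine_ts_mg();
	uint64_t coarse_after = coarse_ts_mg();

	/* coarse_after is taken after the fine read, so it must not be older. */
	printf("coarse=%llu fine=%llu coarse-after=%llu ordered=%d\n",
	       (unsigned long long)coarse_before, (unsigned long long)fine,
	       (unsigned long long)coarse_after, coarse_after >= fine);
	return 0;
}

In the kernel hunks, atomic64_try_cmpxchg() plays the role of the compare-and-swap here, and the per-CPU counter bumped by timekeeping_inc_mg_floor_swaps() records how often the floor actually advances.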