summaryrefslogtreecommitdiff
path: root/kernel/time
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time')
-rw-r--r--kernel/time/posix-clock.c3
-rw-r--r--kernel/time/sched_clock.c34
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/time/timekeeping.c117
-rw-r--r--kernel/time/timekeeping_debug.c13
-rw-r--r--kernel/time/timekeeping_internal.h15
6 files changed, 173 insertions, 15 deletions
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c2f3d0c490d5..1af0bb2cc45c 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
struct posix_clock_desc cd;
int err;
+ if (!timespec64_valid_strict(ts))
+ return -EINVAL;
+
err = get_clock_desc(id, &cd);
if (err)
return err;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68d6c1190ac7..fcca4e72f1ef 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -71,16 +71,16 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
{
- *seq = raw_read_seqcount_latch(&cd.seq);
+ *seq = read_seqcount_latch(&cd.seq);
return cd.read_data + (*seq & 1);
}
notrace int sched_clock_read_retry(unsigned int seq)
{
- return raw_read_seqcount_latch_retry(&cd.seq, seq);
+ return read_seqcount_latch_retry(&cd.seq, seq);
}
-unsigned long long noinstr sched_clock_noinstr(void)
+static __always_inline unsigned long long __sched_clock(void)
{
struct clock_read_data *rd;
unsigned int seq;
@@ -98,11 +98,23 @@ unsigned long long noinstr sched_clock_noinstr(void)
return res;
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return __sched_clock();
+}
+
unsigned long long notrace sched_clock(void)
{
unsigned long long ns;
preempt_disable_notrace();
- ns = sched_clock_noinstr();
+ /*
+ * All of __sched_clock() is a seqcount_latch reader critical section,
+ * but relies on the raw helpers which are uninstrumented. For KCSAN,
+ * mark all accesses in __sched_clock() as atomic.
+ */
+ kcsan_nestable_atomic_begin();
+ ns = __sched_clock();
+ kcsan_nestable_atomic_end();
preempt_enable_notrace();
return ns;
}
@@ -119,17 +131,19 @@ unsigned long long notrace sched_clock(void)
*/
static void update_clock_read_data(struct clock_read_data *rd)
{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
-
/* steer readers towards the odd copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch_begin(&cd.seq);
/* now its safe for us to update the normal (even) copy */
cd.read_data[0] = *rd;
/* switch readers back to the even copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch(&cd.seq);
+
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ write_seqcount_latch_end(&cd.seq);
}
/*
@@ -267,7 +281,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned int seq = raw_read_seqcount_latch(&cd.seq);
+ unsigned int seq = read_seqcount_latch(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753a184c7090..f203f000da1a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -434,6 +434,12 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
* smp_mb__after_spin_lock()
* tick_nohz_task_switch()
* LOAD p->tick_dep_mask
+ *
+ * XXX given a task picks up the dependency on schedule(), should we
+ * only care about tasks that are currently on the CPU instead of all
+ * that are on the runqueue?
+ *
+ * That is, does this want to be: task_on_cpu() / task_curr()?
*/
if (!sched_task_on_rq(tsk))
return;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..cdd61990cd14 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -411,7 +428,7 @@ static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * Employ the latch technique; see @raw_write_seqcount_latch.
+ * Employ the latch technique; see @write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
@@ -424,16 +441,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch_begin(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
+
+ write_seqcount_latch_end(&tkf->seq);
}
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -443,11 +462,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount_latch(&tkf->seq);
+ seq = read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += __timekeeping_get_ns(tkr);
- } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
@@ -2394,6 +2413,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 floor = atomic64_read(&mg_floor);
+ ktime_t f_real, offset, coarse;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ *ts = tk_xtime(tk);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ coarse = timespec64_to_ktime(*ts);
+ f_real = ktime_add(floor, offset);
+ if (ktime_after(f_real, coarse))
+ *ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts: pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t old = atomic64_read(&mg_floor);
+ ktime_t offset, mono;
+ unsigned int seq;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ mono = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ mono = ktime_add_ns(mono, nsecs);
+
+ /*
+ * Attempt to update the floor with the new time value. As any
+ * update must be later then the existing floor, and would effect
+ * a change to ctime from the perspective of the current task,
+ * accept the resulting floor value regardless of the outcome of
+ * the swap.
+ */
+ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+ timekeeping_inc_mg_floor_swaps();
+ } else {
+ /*
+ * Another task changed mg_floor since "old" was fetched.
+ * "old" has been updated with the latest value of "mg_floor".
+ * That value is newer than the previous floor value, which
+ * is enough to effect a change to ctime. Accept it.
+ */
+ *ts = ktime_to_timespec64(ktime_add(old, offset));
+ }
+}
+
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e8850e58d..badeb222eab9 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
#define NUM_BINS 32
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+ return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787d1642..0bbae825bc02 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
* timekeeping debug functions
*/
#ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+ this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
#else
+
#define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
#endif
#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE