From 1c97be677f72b3c338312aecd36d8fff20322f32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 22:02:17 -0700 Subject: list: Use WRITE_ONCE() when adding to lists and hlists Code that does lockless emptiness testing of non-RCU lists is relying on the list-addition code to write the list head's ->next pointer atomically. This commit therefore adds WRITE_ONCE() to list-addition pointer stores that could affect the head's ->next pointer. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 8 ++++---- lib/list_debug.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 993395a2e55c..d7e31fe398b3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } #else extern void __list_add(struct list_head *new, @@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) n->next = first; if (first) first->pprev = &n->next; - h->first = n; + WRITE_ONCE(h->first, n); n->pprev = &h->first; } @@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n, n->pprev = next->pprev; n->next = next; next->pprev = &n->next; - *(n->pprev) = n; + WRITE_ONCE(*(n->pprev), n); } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - prev->next = n; + WRITE_ONCE(prev->next, n); n->pprev = &prev->next; if (n->next) diff --git a/lib/list_debug.c b/lib/list_debug.c index c24c2f7e296f..3859bf63561c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -37,7 +37,7 @@ void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } EXPORT_SYMBOL(__list_add); -- cgit v1.2.3 From 2a67e741bbbc022e0fadf8c6dbc3a76019ecd0cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2015 12:24:23 +0200 Subject: rcu: Create transitive rnp->lock acquisition functions Providing RCU's memory-ordering guarantees requires that the rcu_node tree's locking provide transitive memory ordering, which the Linux kernel's spinlocks currently do not provide unless smp_mb__after_unlock_lock() is used. Having a separate smp_mb__after_unlock_lock() after each and every lock acquisition is error-prone, hard to read, and a bit annoying, so this commit provides wrapper functions that pull in the smp_mb__after_unlock_lock() invocations. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 78 ++++++++++++++++-------------------------------- kernel/rcu/tree.h | 39 ++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 18 ++++------- 3 files changed, 71 insertions(+), 64 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..daf17e248757 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1534,10 +1534,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. 
If there really is no grace @@ -1786,11 +1784,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1814,8 +1811,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1847,8 +1843,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1973,8 +1967,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1993,8 +1986,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2011,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2035,8 +2026,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. 
*/ @@ -2284,8 +2274,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2321,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2343,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2569,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2597,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2794,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2865,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -3005,8 +2988,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3426,8 +3408,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. 
*/ @@ -3447,8 +3428,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3452,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3531,8 +3510,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3527,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3541,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3708,8 +3684,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -4198,8 +4173,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..f32bebb6bc90 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -664,3 +664,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). 
+ */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..fa0e3b96a9ed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,8 +301,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -457,8 +456,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; WARN_ON_ONCE(1); @@ -989,8 +987,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1173,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1567,8 +1563,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ if (needwake) @@ -2068,8 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) -- cgit v1.2.3 From 1658d35ead5d8dd76f2b2d6ad0e32c08d123faa2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 17:03:16 -0700 Subject: list: Use READ_ONCE() when testing for empty lists Most of the list-empty-check macros (list_empty(), hlist_empty(), hlist_bl_empty(), hlist_nulls_empty(), and hlist_nulls_empty()) use an unadorned load to check the list header. Given that these macros are sometimes invoked without the protection of a lock, this is not sufficient. This commit therefore adds READ_ONCE() calls to them. 
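For illustration only (this sketch is not taken from any in-tree caller; the work_queue, work_item, and process_one() names are invented for the example, and it assumes the usual <linux/list.h> and <linux/spinlock.h> declarations), the kind of lockless fast path that this annotation supports looks like the following, where emptiness is checked without the lock and rechecked under the lock before any entry is touched:

	/* Hypothetical types and helper, for the sketch only. */
	struct work_item { struct list_head node; };
	struct work_queue { spinlock_t lock; struct list_head list; };
	extern void process_one(struct work_item *w);

	static void maybe_process_work(struct work_queue *wq)
	{
		struct work_item *w;

		if (list_empty(&wq->list))		/* Lockless peek; relies on READ_ONCE(). */
			return;				/* Appears empty, skip the lock. */
		spin_lock(&wq->lock);
		while (!list_empty(&wq->list)) {	/* Recheck under the lock. */
			w = list_first_entry(&wq->list, struct work_item, node);
			list_del(&w->node);
			process_one(w);
		}
		spin_unlock(&wq->lock);
	}

Without READ_ONCE() in the unlocked check (and WRITE_ONCE() in the corresponding list-addition path), the compiler would be within its rights to tear or refetch the head's ->next load.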
This commit does not touch llist_empty() because it already has the needed ACCESS_ONCE(). Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 4 ++-- include/linux/list_bl.h | 2 +- include/linux/list_nulls.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index d7e31fe398b3..06c2d887a918 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list, */ static inline int list_empty(const struct list_head *head) { - return head->next == head; + return READ_ONCE(head->next) == head; } /** @@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h) static inline int hlist_empty(const struct hlist_head *h) { - return !h->first; + return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 8132214e8efd..ee7229a6c06a 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, static inline int hlist_bl_empty(const struct hlist_bl_head *h) { - return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); + return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 444d2b1313bd..b01fe1009084 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { - return is_a_nulls(h->first); + return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, -- cgit v1.2.3 From 6cf10081220ae21175a867d446b3167bcbcb937b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Oct 2015 15:36:54 -0700 Subject: rcu: Add transitivity to remaining rcu_node ->lock acquisitions The rule is that all acquisitions of the rcu_node structure's ->lock must provide transitivity: The lock is not acquired that frequently, and sorting out exactly which required it and which did not would be a maintenance nightmare. This commit therefore supplies the needed transitivity to the remaining ->lock acquisitions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++++++++++++------------ kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index daf17e248757..81aa1cdc6bc9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,7 +1214,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1237,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1256,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1327,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -2897,7 +2897,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3718,7 +3718,7 @@ retry_ipi: mask_ofl_ipi &= ~mask; } else { /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, @@ -3727,8 +3727,8 @@ retry_ipi: if (cpu_online(cpu) && (rnp->expmask & mask)) goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); + raw_spin_lock_irqsave_rcu_node(rnp, + flags); } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; @@ -4110,7 +4110,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4127,7 +4127,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4154,7 +4154,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4301,7 +4301,7 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa0e3b96a9ed..57ba873d2f18 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -525,7 +525,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..8efaba870d96 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -319,7 +319,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) -- cgit v1.2.3 From 06f60de19d3141f07d954c9275fe7ccca8e96b42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:15:52 -0700 Subject: rcu: Short-circuit synchronize_sched_expedited() if only one CPU If there is only one CPU, then invoking synchronize_sched_expedited() is by definition a grace period. This commit checks for this condition and does a short-circuit return in that case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..bd2605c144cc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3822,6 +3822,10 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); -- cgit v1.2.3 From 1de6e56ddc043437d335ee0455a1b34b73510c91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 09:45:00 -0700 Subject: rcu: Clarify role of ->expmaskinitnext Analogy with the ->qsmaskinitnext field might lead one to believe that ->expmaskinitnext tracks online CPUs. This belief is incorrect: Any CPU that has ever been online will have its bit set in the ->expmaskinitnext field. This commit therefore adds a comment to make this clear, at least to people who read comments. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb6bc90..8151971a8978 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,6 +178,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. 
*/ int grplo; /* lowest-numbered CPU or group here. */ -- cgit v1.2.3 From 886ef5a18a4a771d5fdc0e23ae9373bb35d529e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 12:34:40 -0700 Subject: rcu: Move smp_mb() from rcu_seq_snap() to rcu_exp_gp_seq_snap() The memory barrier in rcu_seq_snap() is needed only for grace periods, so this commit moves it to the grace-period-oriented wrapper rcu_exp_gp_seq_snap(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2605c144cc..a4a0475aede9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3347,7 +3347,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3374,6 +3373,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -- cgit v1.2.3 From 1307f2148719cc9e9d12f5fa7d5b3b61ec5aef72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 15:29:21 -0700 Subject: rcu: Invert sync_rcu_exp_select_cpus() "if" statement This commit saves a couple lines of code and reduces indentation by inverting the sense of an "if" statement in the function sync_rcu_exp_select_cpus(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4a0475aede9..00f07d6436ce 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3716,24 +3716,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; -- cgit v1.2.3 From df5bd5144a80a9f6c3807383b11f735dae9caf9d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 1 Oct 2015 10:26:24 -0700 Subject: rcu: Reduce expedited GP memory contention via per-CPU variables Currently, the piggybacked-work checks carried out by sync_exp_work_done() atomically increment a small set of variables (the ->expedited_workdone0, ->expedited_workdone1, ->expedited_workdone2, ->expedited_workdone3 fields in the rcu_state structure), which will form a memory-contention bottleneck given a sufficiently large number of CPUs concurrently invoking either synchronize_rcu_expedited() or synchronize_sched_expedited(). This commit therefore moves these for fields to the per-CPU rcu_data structure, eliminating the memory contention. The show_rcuexp() function also changes to sum up each field in the rcu_data structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ kernel/rcu/tree.h | 8 ++++---- kernel/rcu/tree_trace.c | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 00f07d6436ce..33d7e2551165 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3585,7 +3585,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3599,7 +3599,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3613,14 +3613,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3630,7 +3629,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8151971a8978..6cbec31f99d6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -500,10 +504,6 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. 
*/ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba870d96..d43649450ea4 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -183,14 +183,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit v1.2.3 From 73f36f9de8bed78bcda2704a348594c20518b455 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 10:56:55 -0800 Subject: rcu: Make expedited grace periods resolve stall-warning ties Currently, if a grace period ends just as the stall-warning timeout fires, an empty stall warning will be printed. This is not helpful, so this commit avoids these useless warnings by rechecking completion after awakening in synchronize_sched_expedited_wait(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 33d7e2551165..bc6b79716a86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3757,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ -- cgit v1.2.3 From 72611ab9f5d2d384a04e72d560c9c82463115cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:25:21 -0800 Subject: rcu: Add more diagnostics to expedited stall warning messages. This commit adds print statements that check the rcu_node structure to find which ->expmask bits and which ->exp_tasks structures are blocking the current expedited grace period. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bc6b79716a86..6a652d1f3d7f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3745,6 +3745,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3767,14 +3768,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3783,8 +3786,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -- cgit v1.2.3 From 5a9be7c628c5273f84abacebf7faf2488376e0f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Nov 2015 15:44:06 -0800 Subject: rcu: Add rcu_normal kernel parameter to suppress expediting Although expedited grace periods can be quite useful, and although their OS jitter has been greatly reduced, they can still pose problems for extreme real-time workloads. This commit therefore adds a rcu_normal kernel boot parameter (which can also be manipulated via sysfs) to suppress expedited grace periods, that is, to treat requests for expedited grace periods as if they were requests for normal grace periods. If both rcu_expedited and rcu_normal are specified, rcu_normal wins. This means that if you are relying on expedited grace periods to speed up boot, you will want to specify rcu_expedited on the kernel command line, and then specify rcu_normal via sysfs once boot completes. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 19 ++++++++++++++----- include/linux/rcupdate.h | 6 ++++++ kernel/ksysfs.c | 22 ++++++++++++++++++++-- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 6 ++++++ kernel/rcu/update.c | 12 ++++++++++++ 7 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..7673943d3085 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3296,6 +3296,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutorture.verbose= [KNL] Enable additional printk() statements. 
+ rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead @@ -3303,11 +3309,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. - rcupdate.rcu_cpu_stall_suppress= [KNL] - Suppress RCU CPU stall warning messages. - - rcupdate.rcu_cpu_stall_timeout= [KNL] - Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_normal= [KNL] + Use only normal grace-period primitives, + for example, synchronize_rcu() instead of + synchronize_rcu_expedited(). This improves + real-time latency, CPU utilization, and energy + efficiency, but can expose users to increased + grace-period latency. This parameter overrides + rcupdate.rcu_expedited. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba67fde..98d9f30c02d4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -49,9 +49,14 @@ #include extern int rcu_expedited; /* for sysctl */ +extern int rcu_normal; /* also for sysctl */ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ { return false; @@ -65,6 +70,7 @@ static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..b4e2fa52d8bc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include #include -#include /* rcu_expedited */ +#include /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -148,7 +148,7 @@ int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +161,23 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ @@ -203,6 +220,7 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, NULL }; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6a652d1f3d7f..489992997c06 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3841,6 +3841,12 @@ void synchronize_sched_expedited(void) if (rcu_blocking_is_gp()) return; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873d2f18..d45df3781551 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -746,6 +746,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..8fccda3a794d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,6 +61,7 @@ MODULE_ALIAS("rcupdate"); #define MODULE_PARAM_PREFIX "rcupdate." module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +114,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit v1.2.3 From 967dcb8fe6a9a75be346400539261e0416baf370 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2015 16:52:36 -0800 Subject: rcu: Wire up rcu_end_inkernel_boot() This commit adds the invocation of rcu_end_inkernel_boot() just before init is invoked. This allows the CONFIG_RCU_EXPEDITE_BOOT Kconfig option to do something useful and prepares for the upcoming rcupdate.rcu_normal_after_boot kernel parameter. Signed-off-by: Paul E. McKenney --- init/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..c6ebefafa496 100644 --- a/init/main.c +++ b/init/main.c @@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused) flush_delayed_fput(); + rcu_end_inkernel_boot(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) -- cgit v1.2.3 From 3e42ec1aa716f10c68294b8492ae3ea684528699 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 25 Nov 2015 18:56:00 -0800 Subject: rcu: Allow expedited grace periods to be disabled at init Expedited grace periods can speed up boot, but are undesirable in aggressive real-time systems. This commit therefore introduces a kernel parameter rcupdate.rcu_normal_after_boot that disables expedited grace periods just before init is spawned. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 5 +++++ kernel/rcu/update.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7673943d3085..197305bbb9b7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3318,6 +3318,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. grace-period latency. This parameter overrides rcupdate.rcu_expedited. + rcupdate.rcu_normal_after_boot= [KNL] + Once boot has completed (that is, after + rcu_end_inkernel_boot() has been invoked), use + only normal grace-period primitives. + rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning messages. Disable with a value less than or equal diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 8fccda3a794d..12b91f5a60a6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -63,6 +63,9 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); + #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? @@ -178,6 +181,8 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } #ifdef CONFIG_PREEMPT_RCU -- cgit v1.2.3 From 3dc5dbe9a1b815b659a6b04540fc6fd4b4e3831b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 Sep 2015 14:51:24 -0700 Subject: rcu: Move lock_class_key to local scope Currently, the rcu_node_class[], rcu_fqs_class[], and rcu_exp_class[] arrays needlessly pollute the global namespace within tree.c. This commit therefore converts them to static local variables within rcu_init_one(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..23df2661c899 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -4365,6 +4361,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. 
*/ -- cgit v1.2.3 From 47dbc90663f697a4515a8dd5c99ae43dba108cb4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Sep 2015 19:14:57 -0400 Subject: kernel: Make rcu/tree_trace.c explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TREE_RCU_TRACE init/Kconfig: def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the file there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall if desired. We don't replace module.h with init.h since the file already has that. We also delete the moduleparam.h include that is left over from commit 64db4cfff99c04cd5f550357edcc8780f96b54a2 (""Tree RCU": scalable classic RCU implementation") since it is not needed here either. We morph some tags like MODULE_AUTHOR into the comments at the top of the file for documentation purposes. Cc: "Paul E. McKenney" Cc: Josh Triplett Reviewed-by: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_trace.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba870d96..82aca98b18f8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -487,16 +486,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); -- cgit v1.2.3 From fecbf6f01fbd83e6419ccb7f61d9a6eb987f1d92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2015 18:19:24 -0700 Subject: rcu: Simplify rcu_sched_qs() control flow This commit applies an early-exit approach to rcu_sched_qs(), reducing the nesting level and saving a line of code. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23df2661c899..ed3bc0578cc5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -244,22 +244,21 @@ void rcu_sched_qs(void) { unsigned long flags; - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } + local_irq_restore(flags); } void rcu_bh_qs(void) -- cgit v1.2.3 From 8ba9153b2c3ab733d64e22adb57820ccb6afc496 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 07:55:41 -0700 Subject: rcu: Remove lock-acquisition loop from rcu_read_unlock_special() Several releases have come and gone without the warning triggering, so remove the lock-acquisition loop. Retain the WARN_ON_ONCE() out of sheer paranoia. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873d2f18..ae4ce2b665f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -449,19 +449,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit v1.2.3 From 699d40352059e64a4d993af170272585c41988d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:47:49 -0700 Subject: rcu: Fix obsolete rcu_bootup_announce_oddness() comment This function no longer has #ifdefs, so this commit removes the header comment calling them out. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ae4ce2b665f8..42df93721e6f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { -- cgit v1.2.3 From f0f2e7d307fff226e0c1df5a07101a1216a46d8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:59:32 -0700 Subject: rcu: Avoid tick_nohz_active checks on NOCBs CPUs Currently, rcu_prepare_for_idle() checks for tick_nohz_active, even on individual NOCBs CPUs, unless all CPUs are marked as NOCBs CPUs at build time. This check is pointless on NOCBs CPUs because they never have any callbacks posted, given that all of their callbacks are handed off to the corresponding rcuo kthread. There is a check for individually designated NOCBs CPUs, but it pointelessly follows the check for tick_nohz_active. This commit therefore moves the check for individually designated NOCBs CPUs up with the check for CONFIG_RCU_NOCB_CPU_ALL. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 42df93721e6f..8e9d4a4d0326 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1513,7 +1513,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1527,10 +1528,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating -- cgit v1.2.3 From 46a5d164db53ba6066b11889abb7fa6bddbe5cf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 09:10:48 -0700 Subject: rcu: Stop disabling interrupts in scheduler fastpaths We need the scheduler's fastpaths to be, well, fast, and unnecessarily disabling and re-enabling interrupts is not necessarily consistent with this goal. Especially given that there are regions of the scheduler that already have interrupts disabled. This commit therefore moves the call to rcu_note_context_switch() to one of the interrupts-disabled regions of the scheduler, and removes the now-redundant disabling and re-enabling of interrupts from rcu_note_context_switch() and the functions it calls. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Shift rcu_note_context_switch() to avoid deadlock, as suggested by Peter Zijlstra. 
] --- include/linux/rcutree.h | 2 +- kernel/rcu/tree.c | 27 ++++++++++++--------------- kernel/rcu/tree_plugin.h | 14 ++++++-------- kernel/sched/core.c | 6 ++++-- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 60d15a080d7c..9d3eda39bcd2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU - * to save a few bytes. + * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(int cpu) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ed3bc0578cc5..93941d3434ad 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -242,8 +242,6 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -252,13 +250,9 @@ void rcu_sched_qs(void) __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - } - local_irq_restore(flags); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -295,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -335,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -371,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8e9d4a4d0326..e6da888cc908 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -146,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. 
*/ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -250,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -285,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -300,7 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -316,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..ec72de234feb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3085,7 +3085,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3104,13 +3103,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ -- cgit v1.2.3 From 7d86dccf28a3ae2f790f399fc82d4c82521fd078 Mon Sep 17 00:00:00 2001 From: Petko Manolov Date: Mon, 12 Oct 2015 18:23:51 +0300 Subject: list: Introduces generic list_splice_tail_init_rcu() The list_splice_init_rcu() can be used as a stack onto which full lists are pushed, but queue-like behavior is now needed by some security policies. This requires a list_splice_tail_init_rcu(). This commit therefore supplies a list_splice_tail_init_rcu() by pulling code common it and to list_splice_init_rcu() into a new __list_splice_init_rcu() function. This new function is based on the existing list_splice_init_rcu() implementation. Signed-off-by: Petko Manolov Cc: Mimi Zohar Signed-off-by: Paul E. 
McKenney --- include/linux/rculist.h | 69 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed540986019..e99d834545b6 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old, } /** - * list_splice_init_rcu - splice an RCU-protected list into an existing list. + * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice - * @head: the place in the list to splice the first list into + * @prev: points to the last element of the existing list + * @next: points to the first element of the existing list * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... * - * @head can be RCU-read traversed concurrently with this function. + * The list pointed to by @prev and @next can be RCU-read traversed + * concurrently with this function. * * Note that this function blocks. * - * Important note: the caller must take whatever action is necessary to - * prevent any other updates to @head. In principle, it is possible - * to modify the list as soon as sync() begins execution. - * If this sort of thing becomes necessary, an alternative version - * based on call_rcu() could be created. But only if -really- - * needed -- there is no shortage of RCU API members. + * Important note: the caller must take whatever action is necessary to prevent + * any other updates to the existing list. In principle, it is possible to + * modify the list as soon as sync() begins execution. If this sort of thing + * becomes necessary, an alternative version based on call_rcu() could be + * created. But only if -really- needed -- there is no shortage of RCU API + * members. */ -static inline void list_splice_init_rcu(struct list_head *list, - struct list_head *head, - void (*sync)(void)) +static inline void __list_splice_init_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next, + void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; - struct list_head *at = head->next; - - if (list_empty(list)) - return; /* * "first" and "last" tracking list, so initialize it. RCU readers @@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list, * this function. */ - last->next = at; - rcu_assign_pointer(list_next_rcu(head), first); - first->prev = head; - at->prev = last; + last->next = next; + rcu_assign_pointer(list_next_rcu(prev), first); + first->prev = prev; + next->prev = last; +} + +/** + * list_splice_init_rcu - splice an RCU-protected list into an existing list, + * designed for stacks. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... + */ +static inline void list_splice_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head, head->next, sync); +} + +/** + * list_splice_tail_init_rcu - splice an RCU-protected list into an existing + * list, designed for queues. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... 
+ */ +static inline void list_splice_tail_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head->prev, head, sync); } /** -- cgit v1.2.3 From 2f073848c3cc8aff2655ab7c46d8c0de90cf4e50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Oct 2015 16:56:42 -0700 Subject: list: Use WRITE_ONCE() when initializing list_head structures Code that does lockless emptiness testing of non-RCU lists is relying on INIT_LIST_HEAD() to write the list head's ->next pointer atomically, particularly when INIT_LIST_HEAD() is invoked from list_del_init(). This commit therefore adds WRITE_ONCE() to this function's pointer stores that could affect the head's ->next pointer. Reported-by: Andrey Konovalov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/list.h b/include/linux/list.h index 06c2d887a918..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -24,7 +24,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { - list->next = list; + WRITE_ONCE(list->next, list); list->prev = list; } -- cgit v1.2.3 From 649e4368ff786e3d02eb2a06b1493fb217d74408 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 13:32:08 -0700 Subject: documentation: Record RCU requirements This commit adds RCU requirements as published in a 2015 LWN series. Bringing these requirements in-tree allows them to be updated as changes are discovered. Signed-off-by: Paul E. McKenney [ paulmck: Updates to charset and URLs as suggested by Josh Triplett. ] --- .../RCU/Design/Requirements/2013-08-is-it-dead.png | Bin 0 -> 100825 bytes .../Design/Requirements/GPpartitionReaders1.svg | 374 +++ .../RCU/Design/Requirements/RCUApplicability.svg | 237 ++ .../Design/Requirements/ReadersPartitionGP1.svg | 639 +++++ .../RCU/Design/Requirements/Requirements.html | 2799 ++++++++++++++++++++ .../RCU/Design/Requirements/Requirements.htmlx | 2643 ++++++++++++++++++ Documentation/RCU/Design/htmlqqz.sh | 108 + 7 files changed, 6800 insertions(+) create mode 100644 Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png create mode 100644 Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg create mode 100644 Documentation/RCU/Design/Requirements/RCUApplicability.svg create mode 100644 Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg create mode 100644 Documentation/RCU/Design/Requirements/Requirements.html create mode 100644 Documentation/RCU/Design/Requirements/Requirements.htmlx create mode 100755 Documentation/RCU/Design/htmlqqz.sh diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png new file mode 100644 index 000000000000..7496a55e4e7b Binary files /dev/null and b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png differ diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg new file mode 100644 index 000000000000..4b4014fda770 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg @@ -0,0 +1,374 @@ + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + synchronize_rcu() + + + + + + + WRITE_ONCE(a, 1); + WRITE_ONCE(b, 1); + r1 = READ_ONCE(a); + WRITE_ONCE(c, 1); + r2 = READ_ONCE(b); + r3 = READ_ONCE(c); + thread0() + thread1() + thread2() + + + + rcu_read_lock(); + rcu_read_lock(); + 
rcu_read_unlock(); + rcu_read_unlock(); + + QS + + QS + + + QS + + diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg new file mode 100644 index 000000000000..ebcbeee391ed --- /dev/null +++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg @@ -0,0 +1,237 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + Read-Mostly, Stale & + + Inconsistent Data OK + + (RCU Works Great!!!) + + (RCU Works Well) + + Read-Mostly, Need Consistent Data + + Read-Write, Need Consistent Data + + Update-Mostly, Need Consistent Data + + (RCU Might Be OK...) + + (1) Provide Existence Guarantees For Update-Friendly Mechanisms + + (2) Provide Wait-Free Read-Side Primitives for Real-Time Use) + + (RCU is Very Unlikely to be the Right Tool For The Job, But it Can: + + diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg new file mode 100644 index 000000000000..48cd1623d4d4 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg @@ -0,0 +1,639 @@ + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + synchronize_rcu() + + + + + + + WRITE_ONCE(a, 1); + WRITE_ONCE(b, 1); + r1 = READ_ONCE(a); + WRITE_ONCE(c, 1); + WRITE_ONCE(d, 1); + r2 = READ_ONCE(c); + thread0() + thread1() + thread2() + + + + rcu_read_lock(); + rcu_read_lock(); + rcu_read_unlock(); + rcu_read_unlock(); + + QS + + QS + + + QS + + + + synchronize_rcu() + + + + + + + r3 = READ_ONCE(d); + WRITE_ONCE(e, 1); + + QS + r4 = READ_ONCE(b); + r5 = READ_ONCE(e); + rcu_read_lock(); + rcu_read_unlock(); + QS + + QS + + QS + + thread3() + thread4() + + diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 000000000000..36de7aaa941e --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -0,0 +1,2799 @@ + + + + + A Tour Through RCU's Requirements [LWN.net] + + +

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation, however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + +

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + RCU Primitives Guaranteed to Execute Unconditionally +
  4. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +
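For example, the following sketch (with a hypothetical do_nested_work() helper) nests one marker pair within another, and RCU treats the whole thing as a single read-side critical section:

void nested_reader(void)
{
	rcu_read_lock();	/* Outermost marker: the critical section begins here. */
	do_nested_work();	/* Hypothetical work done under RCU protection. */
	rcu_read_lock();	/* Nested marker: legal, and part of the same section. */
	do_nested_work();
	rcu_read_unlock();	/* Does not end the critical section... */
	rcu_read_unlock();	/* ...this outermost marker does. */
}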

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +
Answer + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? +
Answer + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +
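For illustration, the following sketch (the function name is made up for this example) shows the forbidden pattern; the synchronize_rcu() waits for all pre-existing readers, including the very read-side critical section that contains it, so this can deadlock outright or otherwise misbehave:

void buggy_reader_waits_for_gp(void)	/* BUGGY: do not do this. */
{
	rcu_read_lock();
	synchronize_rcu();	/* Waits for all readers, including this one. */
	rcu_read_unlock();
}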

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = b;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +
Answer + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +
Answer + +

+This simple linked-data-structure scenario clearly demonstrates the need +for RCU's stringent memory-ordering guarantees on systems with more than +one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, but also in the case where the thread executing the + synchronize_rcu() migrates in the meantime. +
+ +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +
Answer + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +
Answer + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees have provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(), however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. + +
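One way such a read-to-write upgrade might look is sketched below; the struct elem, its ->lock, and the search_for_elem() helper are assumptions made for illustration rather than part of any RCU API:

struct elem {
	spinlock_t lock;
	int a;
	/* ... */
};

void upgrade_reader_to_writer(int key)
{
	struct elem *p;

	rcu_read_lock();
	p = search_for_elem(key);	/* Hypothetical RCU-protected lookup. */
	if (p) {
		spin_lock(&p->lock);	/* Update-side lock acquired within the reader. */
		p->a++;			/* Update carried out while still under rcu_read_lock(). */
		spin_unlock(&p->lock);
	}
	rcu_read_unlock();
}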

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? +
Answer + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long, however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

Quick Quiz 8: +Can't the compiler also reorder this code? +
Answer + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +
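A minimal sketch of why this matters appears below; the cleanup_done flag and the helper functions are hypothetical. The reader can begin after the updater's synchronize_rcu() has already started, so synchronize_rcu() is under no obligation to wait for it, and the reader may well observe the post-grace-period state:

static bool cleanup_done;

void updater(void)
{
	remove_element();		/* Hypothetical removal of an RCU-protected element. */
	synchronize_rcu();		/* Waits only for readers that began before this point. */
	WRITE_ONCE(cleanup_done, true);
}

void late_reader(void)
{
	rcu_read_lock();		/* May begin after synchronize_rcu() starts. */
	if (READ_ONCE(cleanup_done))
		do_something();		/* Entirely possible, reader or not. */
	rcu_read_unlock();
}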

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +
Answer + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
+ +is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +
Answer + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +
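As a sketch, using the synchronize_sched() API available at the time of this writing, a non-preemptible region can be paired with RCU-sched as follows (do_something(), remove_element(), and free_element() are placeholders):

void rcu_sched_reader(void)
{
	preempt_disable();	/* Any non-preemptible region is an RCU-sched reader. */
	do_something();
	preempt_enable();
}

void rcu_sched_updater(void)
{
	remove_element();	/* Hypothetical removal step. */
	synchronize_sched();	/* Waits for all non-preemptible regions to complete. */
	free_element();		/* Hypothetical reclamation step. */
}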

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time.
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. + +

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. + +

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

Quick Quiz 11: +What about sleeping locks? +
Answer + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation between a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +
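The general shape of that technique is sketched below; struct sem_obj, its ->deleted flag, and the lookup_sem() helper are illustrative assumptions rather than the actual SystemV semaphore code:

struct sem_obj {
	spinlock_t lock;
	bool deleted;
	/* ... */
};

bool update_sem(int id)
{
	struct sem_obj *s;
	bool ret = false;

	rcu_read_lock();
	s = lookup_sem(id);		/* Hypothetical RCU-protected lookup. */
	if (s) {
		spin_lock(&s->lock);	/* Per-object lock provides the needed consistency. */
		if (!s->deleted) {	/* Removal sets ->deleted under ->lock before freeing. */
			do_update(s);	/* Hypothetical update. */
			ret = true;
		}
		spin_unlock(&s->lock);
	}
	rcu_read_unlock();
	return ret;
}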

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. + +

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +
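As a sketch, an expedited variant of the earlier remove_gp_synchronous() simply substitutes synchronize_rcu_expedited() for synchronize_rcu(); the function remove_gp_expedited() below is hypothetical, not an existing kernel function:

bool remove_gp_expedited(void)
{
	struct foo *p;

	spin_lock(&gp_lock);
	p = rcu_access_pointer(gp);
	if (!p) {
		spin_unlock(&gp_lock);
		return false;
	}
	rcu_assign_pointer(gp, NULL);
	spin_unlock(&gp_lock);
	synchronize_rcu_expedited();	/* Tens of microseconds rather than milliseconds. */
	kfree(p);
	return true;
}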

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_dereference(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears
+on lines 1-5.
+The function remove_gp_cb() is passed to call_rcu()
+on line 25, and will be invoked after the end of a subsequent
+grace period.
+This has the same effect as remove_gp_synchronous(),
+but without forcing the updater to wait for a grace period to elapse.
+The call_rcu() function may be used in a number of
+situations where neither synchronize_rcu() nor
+synchronize_rcu_expedited() would be legal,
+including within preempt-disable code, local_bh_disable() code,
+interrupt-disable code, and interrupt handlers.
+However, even call_rcu() is illegal within NMI handlers.
+The callback function (remove_gp_cb() in this case) will be
+executed in softirq (software interrupt) context within the
+Linux kernel,
+either within a real softirq handler or under the protection
+of local_bh_disable().
+In both the Linux kernel and userspace, it is bad practice to
+write an RCU callback function that takes too long.
+Long-running operations should be relegated to separate threads or
+(in the Linux kernel) workqueues.
+
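+
+For example, the following minimal sketch (not taken from the Linux
+kernel) shows one way to hand long-running cleanup off to a workqueue
+from within an RCU callback.
+The struct foo_defer type and the do_something_expensive() helper are
+hypothetical placeholders:
+
+  struct foo_defer {
+    struct rcu_head rh;
+    struct work_struct work;
+  };
+
+  static void foo_defer_work_fn(struct work_struct *wp)
+  {
+    struct foo_defer *p = container_of(wp, struct foo_defer, work);
+
+    do_something_expensive(p);  /* Hypothetical long-running cleanup. */
+    kfree(p);
+  }
+
+  static void foo_defer_rcu_cb(struct rcu_head *rhp)
+  {
+    struct foo_defer *p = container_of(rhp, struct foo_defer, rh);
+
+    /* Keep the softirq-context callback short: punt to a workqueue. */
+    INIT_WORK(&p->work, foo_defer_work_fn);
+    schedule_work(&p->work);
+  }
+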

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +
Answer + +

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes +kfree_rcu() and proceeds, without any need to pay any +further attention to the subsequent grace period and kfree(). +It is permissible to invoke kfree_rcu() from the same +environments as for call_rcu(). +Interestingly enough, DYNIX/ptx had the equivalents of +call_rcu() and kfree_rcu(), but not +synchronize_rcu(). +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +synchronize_rcu() simply open-coded it. + +

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +
Answer + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a
+“cookie” from RCU,
+then line 15 carries out other tasks,
+and finally, line 16 returns immediately if a grace period has
+elapsed in the meantime, but otherwise waits as required.
+The need for get_state_synchronize_rcu() and
+cond_synchronize_rcu() has appeared quite recently,
+so it is too early to tell whether they will stand the test of time.
+

+RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +
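+
+As a minimal sketch of this composability, the following pair of
+hypothetical functions nests one RCU read-side critical section within
+another, and RCU simply treats the result as one large critical section:
+
+  static void do_reader_work(void)
+  {
+    rcu_read_lock();
+    /* Access RCU-protected data here. */
+    rcu_read_unlock();
+  }
+
+  static void do_composed_reader_work(void)
+  {
+    rcu_read_lock();
+    do_reader_work();  /* Legal: nesting extends the critical section. */
+    rcu_read_unlock();
+  }
+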

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +
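+
+The forbidden pattern looks something like the following sketch, in
+which do_something() and do_something_else() stand in for arbitrary
+reader code:
+
+  static void buggy_composed_reader(void)
+  {
+    rcu_read_lock();
+    do_something();
+    synchronize_rcu();  /* BUG: waits for a grace period inside a reader. */
+    do_something_else();
+    rcu_read_unlock();
+  }
+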

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of
+RCU read-side critical sections, perhaps even so intense that there
+is never a point in time during which there is not at least one
+RCU read-side critical section in flight.
+RCU cannot allow this situation to block grace periods: As long as
+all the RCU read-side critical sections are finite, grace periods
+must also be finite.
+

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early
+1990s, a simple user-level test consisting of close(open(path))
+in a tight loop
+in the early 2000s suddenly provided a much deeper appreciation of the
+high-update-rate corner case.
+This test also motivated the addition of some RCU code to react to high
+update rates.
+For example, if a given CPU finds itself with more than 10,000
+RCU callbacks queued, RCU will take evasive action by
+more aggressively starting grace periods and more aggressively forcing
+completion of grace-period processing.
+This evasive action causes the grace period to complete more quickly,
+but at the cost of restricting RCU's batching optimizations, thus
+increasing the CPU overhead incurred by that grace period.
+

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock()
+	everywhere that it is needed, so kernels built with
+	CONFIG_PROVE_RCU=y will splat if
+	rcu_dereference() is used outside of an
+	RCU read-side critical section.
+	Update-side code can use rcu_dereference_protected(),
+	which takes a
+	lockdep expression
+	to indicate what is providing the protection.
+	If the indicated protection is not provided, a lockdep splat
+	is emitted.
+

    + Code shared between readers and updaters can use + rcu_dereference_check(), which also takes a + lockdep expression, and emits a lockdep splat if neither + rcu_read_lock() nor the indicated protection + is in place. + In addition, rcu_dereference_raw() is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, rcu_read_lock_held() is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +

  2. A given function might wish to check for RCU-related preconditions
+	upon entry, before using any other RCU API.
+	The rcu_lockdep_assert() macro does this job,
+	asserting the expression in kernels having lockdep enabled
+	and doing nothing otherwise.
+	(A sketch illustrating these lockdep-based APIs appears just
+	after this list.)
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. +
  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +
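+
+The following sketch illustrates the lockdep-based APIs called out in
+items 1 and 2 above, reusing the gp pointer, the gp_lock spinlock, and
+struct foo from the earlier examples.
+The functions themselves are hypothetical:
+
+  /* Update-side access: gp_lock, not rcu_read_lock(), is the protection. */
+  static void update_gp_a(int new_a)
+  {
+    struct foo *p;
+
+    spin_lock(&gp_lock);
+    p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
+    if (p)
+      WRITE_ONCE(p->a, new_a);  /* Readers might access ->a concurrently. */
+    spin_unlock(&gp_lock);
+  }
+
+  /* Shared between readers and updaters: either form of protection is OK. */
+  static int get_gp_a(void)
+  {
+    struct foo *p = rcu_dereference_check(gp, lockdep_is_held(&gp_lock));
+
+    return p ? p->a : -1;
+  }
+
+  /* Precondition check on entry: complain unless gp_lock is already held. */
+  static void update_gp_b_locked(int new_b)
+  {
+    struct foo *p;
+
+    rcu_lockdep_assert(lockdep_is_held(&gp_lock),
+                       "update_gp_b_locked() needs gp_lock");
+    p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
+    if (p)
+      WRITE_ONCE(p->b, new_b);
+  }
+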

+This is not a hard-and-fast list: RCU's diagnostic capabilities will
+continue to be guided by the number and type of usage bugs found
+in real-world RCU usage.
+

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by +kernel boot parameters and Kconfig options. +Unfortunately, the Kconfig system will explicitly ask users +about new Kconfig options, which requires almost all of them +be hidden behind a CONFIG_RCU_EXPERT Kconfig option. + +

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the
+firmware, and sometimes things are lost in translation.
+Or the translation is accurate, but the original message is bogus.
+

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any
+time during boot, callbacks are not guaranteed to be invoked until after
+the scheduler is fully up and running.
+This delay in callback invocation is due to the fact that RCU does not
+invoke callbacks until it is fully initialized, and this full initialization
+cannot occur until after the scheduler has initialized itself to the
+point where RCU can spawn and run its kthreads.
+In theory, it would be possible to invoke callbacks earlier;
+however, this is not a panacea because there would be severe restrictions
+on what operations those callbacks could invoke.
+

+Perhaps surprisingly, synchronize_rcu(),
+synchronize_rcu_bh()
+(discussed below),
+and
+synchronize_sched()
+will all operate normally
+during very early boot, the reason being that there is only one CPU
+and preemption is disabled.
+This means that a call to synchronize_rcu() (or one of its friends)
+is itself a quiescent
+state and thus a grace period, so the early-boot implementation can
+be a no-op.
+

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +
Answer + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +
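+
+For example, a module's exit function might look like the following
+sketch, in which foo_unregister_hooks() is a hypothetical stand-in for
+whatever prevents that module from posting further callbacks:
+
+  static void __exit foo_exit(void)
+  {
+    /* First prevent any new call_rcu() invocations by this module. */
+    foo_unregister_hooks();  /* Hypothetical. */
+
+    /* Then wait for all already-queued callbacks to be invoked. */
+    rcu_barrier();
+
+    /* Only now is it safe for the module's code and data to vanish. */
+  }
+  module_exit(foo_exit);
+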

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless +it also releases them before exiting that same +RCU read-side critical section. +This same prohibition also applies to any lock that is acquired +while holding any lock to which this prohibition applies. +Violating this rule results in deadlock. + +

+For RCU's part, the preemptible-RCU rcu_read_unlock()
+implementation must be written carefully to avoid similar deadlocks.
+In particular, rcu_read_unlock() must tolerate an
+interrupt where the interrupt handler invokes both
+rcu_read_lock() and rcu_read_unlock().
+This possibility requires rcu_read_unlock() to use
+negative nesting levels to avoid destructive recursion via
+the interrupt handler's use of RCU.
+

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to
+avoid excessive CPU-time accumulation by these kthreads.
+This requirement was no surprise, but RCU's violation of it
+when running context-switch-heavy workloads when built with
+CONFIG_NO_HZ_FULL=y
+did come as a surprise [PDF].
+RCU has made good progress towards meeting this requirement, even
+for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads,
+but there is room for further improvement.
+

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +

+Because RCU avoids interrupting idle CPUs, it is illegal to
+execute an RCU read-side critical section on an idle CPU.
+(Kernels built with CONFIG_PROVE_RCU=y will splat
+if you try it.)
+The RCU_NONIDLE() macro and _rcuidle
+event tracing are provided to work around this restriction.
+In addition, rcu_is_watching() may be used to
+test whether or not it is currently legal to run RCU read-side
+critical sections on this CPU.
+I learned of the need for diagnostics on the one hand
+and RCU_NONIDLE() on the other while inspecting
+idle-loop code.
+Steven Rostedt supplied _rcuidle event tracing,
+which is used quite heavily in the idle loop.
+
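+
+As a rough sketch, code that might run while RCU is ignoring the
+current CPU can either check with rcu_is_watching() or wrap its RCU
+usage in RCU_NONIDLE().
+Here, do_event_with_rcu_readers() is a hypothetical helper containing
+RCU read-side code:
+
+  static void report_event_from_idle(void)
+  {
+    if (rcu_is_watching()) {
+      do_event_with_rcu_readers();  /* RCU readers are already legal. */
+    } else {
+      /* Make RCU watch just long enough for the statement below. */
+      RCU_NONIDLE(do_event_with_rcu_readers());
+    }
+  }
+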

+It is similarly socially unacceptable to interrupt an +nohz_full CPU running in userspace. +RCU must therefore track nohz_full userspace +execution. +And in +CONFIG_NO_HZ_FULL_SYSIDLE=y +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +

+These energy-efficiency requirements have proven quite difficult to
+understand and to meet; for example, there have been more than five
+clean-sheet rewrites of RCU's energy-efficiency code, the last of
+which was finally able to demonstrate
+real energy savings running on real hardware [PDF].
+As noted earlier,
+I learned of many of these requirements via angry phone calls:
+Flaming me on the Linux-kernel mailing list was apparently not
+sufficient to fully vent their ire at RCU's energy-efficiency bugs!
+

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the
+earlier discussion,
+RCU is used heavily by hot code paths in performance-critical
+portions of the Linux kernel's networking, security, virtualization,
+and scheduling code paths.
+RCU must therefore use efficient implementations, especially in its
+read-side primitives.
+To that end, it would be good if preemptible RCU's implementation
+of rcu_read_lock() could be inlined; however, doing
+this requires resolving #include issues with the
+task_struct structure.
+

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must decrease the +per-operation overhead, witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(). +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because
+rcu_read_lock_bh() and rcu_read_unlock_bh()
+disable and re-enable softirq handlers, any attempt to start a softirq
+handler during the
+RCU-bh read-side critical section will be deferred.
+In this case, rcu_read_unlock_bh()
+will invoke softirq processing, which can take considerable time.
+One can of course argue that this softirq overhead should be associated
+with the code following the RCU-bh read-side critical section rather
+than rcu_read_unlock_bh(), but the fact
+is that most profiling tools cannot be expected to make this sort
+of fine distinction.
+For example, suppose that a three-millisecond-long RCU-bh read-side
+critical section executes during a time of heavy networking load.
+There will very likely be an attempt to invoke at least one softirq
+handler during that three milliseconds, but any such invocation will
+be delayed until the time of the rcu_read_unlock_bh().
+This can of course make it appear at first glance as if
+rcu_read_unlock_bh() was executing very slowly.
+

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +
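+
+For example, an RCU-bh reader for the gp pointer from the earlier
+examples might look like the following sketch:
+
+  static int get_gp_a_bh(void)
+  {
+    struct foo *p;
+    int a = -1;
+
+    rcu_read_lock_bh();          /* Also defers softirq handlers. */
+    p = rcu_dereference_bh(gp);
+    if (p)
+      a = p->a;
+    rcu_read_unlock_bh();        /* May run deferred softirq handlers. */
+    return a;
+  }
+
+An updater freeing the structure would then use call_rcu_bh() or
+synchronize_rcu_bh() rather than their unadorned counterparts.
+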

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the
+side effect of also waiting for all pre-existing interrupt
+and NMI handlers.
+However, there are legitimate preemptible-RCU implementations that
+do not have this property, given that any point in the code outside
+of an RCU read-side critical section can be a quiescent state.
+Therefore, RCU-sched was created, which follows “classic”
+RCU in that an RCU-sched grace period waits for pre-existing
+interrupt and NMI handlers.
+In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched
+APIs have identical implementations, while kernels built with
+CONFIG_PREEMPT=y provide a separate implementation for each.
+

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The
+RCU-sched API
+includes
+rcu_read_lock_sched(),
+rcu_read_unlock_sched(),
+rcu_read_lock_sched_notrace(),
+rcu_read_unlock_sched_notrace(),
+rcu_dereference_sched(),
+rcu_dereference_sched_check(),
+synchronize_sched(),
+synchronize_sched_expedited(),
+call_rcu_sched(),
+rcu_barrier_sched(), and
+rcu_read_lock_sched_held().
+However, anything that disables preemption also marks an RCU-sched
+read-side critical section, including
+preempt_disable() and preempt_enable(),
+local_irq_save() and local_irq_restore(),
+and so on.
+
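+
+For example, the following sketch uses a preempt-disable region as an
+RCU-sched reader and synchronize_sched() on the update side, again
+reusing the gp pointer and gp_lock from the earlier examples:
+
+  static int get_gp_a_sched(void)
+  {
+    struct foo *p;
+    int a = -1;
+
+    preempt_disable();           /* Begin RCU-sched reader. */
+    p = rcu_dereference_sched(gp);
+    if (p)
+      a = p->a;
+    preempt_enable();            /* End RCU-sched reader. */
+    return a;
+  }
+
+  static void clear_gp_sched(void)
+  {
+    struct foo *p;
+
+    spin_lock(&gp_lock);
+    p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
+    rcu_assign_pointer(gp, NULL);
+    spin_unlock(&gp_lock);
+    synchronize_sched();         /* Wait for pre-existing readers. */
+    kfree(p);
+  }
+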

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections, +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 acquired a mutex that was held across
+a synchronize_srcu() for domain ss,
+deadlock would still be possible.
+Furthermore, if line 5 acquired a mutex that was held across
+a synchronize_srcu() for some other domain ss1,
+and if an ss1-domain SRCU read-side critical section
+acquired another mutex that was held across an ss-domain
+synchronize_srcu(),
+deadlock would again be possible.
+Such a deadlock cycle could extend across an arbitrarily large number
+of different SRCU domains.
+Again, with great power comes great responsibility.
+

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +
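+
+The following sketch defines a hypothetical foo_srcu domain protecting
+a hypothetical foo_p pointer (guarded on the update side by an equally
+hypothetical foo_lock), and shows both a reader and a synchronous
+updater:
+
+  DEFINE_SRCU(foo_srcu);           /* One SRCU domain. */
+  struct foo __rcu *foo_p;         /* Hypothetical SRCU-protected pointer. */
+  DEFINE_SPINLOCK(foo_lock);       /* Serializes updates to foo_p. */
+
+  static int get_foo_a(void)
+  {
+    struct foo *p;
+    int idx, a = -1;
+
+    idx = srcu_read_lock(&foo_srcu);
+    p = srcu_dereference(foo_p, &foo_srcu);
+    if (p)
+      a = p->a;                    /* Sleeping here would be legal. */
+    srcu_read_unlock(&foo_srcu, idx);
+    return a;
+  }
+
+  static void replace_foo(struct foo *newp)
+  {
+    struct foo *oldp;
+
+    spin_lock(&foo_lock);
+    oldp = rcu_dereference_protected(foo_p, lockdep_is_held(&foo_lock));
+    rcu_assign_pointer(foo_p, newp);
+    spin_unlock(&foo_lock);
+    synchronize_srcu(&foo_srcu);   /* Wait for pre-existing foo_srcu readers. */
+    kfree(oldp);
+  }
+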

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the
+binary rewriting required to install different types of probes.
+It would be good to be able to free old trampolines, which sounds
+like a job for some form of RCU.
+However, because it is necessary to be able to install a trace
+anywhere in the code, it is not possible to use read-side markers
+such as rcu_read_lock() and rcu_read_unlock().
+In addition, it does not work to have these markers in the trampoline
+itself, because there would need to be instructions following
+rcu_read_unlock().
+Although synchronize_rcu() would guarantee that execution
+reached the rcu_read_unlock(), it would not be able to
+guarantee that execution had completely left the trampoline.
+

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +
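+
+As a sketch of the intended use case, freeing an old trampoline might
+look like the following, where arch_free_trampoline() is a hypothetical
+helper:
+
+  static void free_old_trampoline(void *tramp)
+  {
+    /*
+     * Wait until every task has passed through a voluntary context
+     * switch or usermode execution, so that no task can still be
+     * executing instructions within the old trampoline.
+     */
+    synchronize_rcu_tasks();
+    arch_free_trampoline(tramp);   /* Hypothetical. */
+  }
+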

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +
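+
+For example, on a hypothetical system built from 16-CPU sockets, the
+kernel boot command line might include:
+
+  rcutree.rcu_fanout_leaf=16
+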

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU
+requirements.
+Given that the requirements keep changing, this will not be the last
+word on this subject, but at least it serves to get an important
+subset of the requirements set forth.
+

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

+Answers to Quick Quizzes

+ + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? + + +

Answer: +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. + + +

Back to Quick Quiz 1. + + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? + + +

Answer: +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). + + +

Back to Quick Quiz 2. + + +

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? + + +

Answer: +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. + + +

Back to Quick Quiz 3. + + +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? + + +

Answer:
+Let's start with what happens to do_something_gp()
+if it fails to use rcu_dereference().
+It could reuse a value formerly fetched from this same pointer.
+It could also fetch the pointer from gp in a byte-at-a-time
+manner, resulting in load tearing, in turn resulting in a bytewise
+mash-up of two distinct pointer values.
+It might even use value-speculation optimizations, where it makes a wrong
+guess, but by the time it gets around to checking the value, an update
+has changed the pointer to match the wrong guess.
+Too bad about any dereferences that returned pre-initialization garbage
+in the meantime!
+

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). + + +

Back to Quick Quiz 4. + + +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? + + +

Answer: +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. + + +

Back to Quick Quiz 5. + + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? + + +

Answer: +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! + + +

Back to Quick Quiz 6. + + +

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? + + +

Answer: +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. + + +

Back to Quick Quiz 7. + + +

Quick Quiz 8: +Can't the compiler also reorder this code? + + +

Answer: +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. + + +

Back to Quick Quiz 8. + + +

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? + + +

Answer: +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. + + +

Back to Quick Quiz 9. + + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? + + +

Answer: +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. + + +

Back to Quick Quiz 10. + + +

Quick Quiz 11: +What about sleeping locks? + + +

Answer:
+These are forbidden within Linux-kernel RCU read-side critical sections
+because it is not legal to place a quiescent state (in this case, a
+voluntary context switch) within an RCU read-side critical section.
+However, sleeping locks may be used within userspace RCU read-side critical
+sections, and also within Linux-kernel sleepable RCU
+(SRCU)
+read-side critical sections.
+In addition, the -rt patchset turns spinlocks into sleeping locks so
+that the corresponding critical sections can be preempted, which
+also means that these sleeplockified spinlocks (but not other sleeping locks!)
+may be acquired within -rt-Linux-kernel RCU read-side critical sections.
+

+Note that it is legal for a normal RCU read-side critical section
+to conditionally acquire a sleeping lock (as in mutex_trylock()),
+but only as long as it does not loop indefinitely attempting to
+conditionally acquire that sleeping lock.
+The key point is that things like mutex_trylock()
+either return with the mutex held, or return an error indication if
+the mutex was not immediately available.
+Either way, mutex_trylock() returns immediately without sleeping.
+

Back to Quick Quiz 11. + + +

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? + + +

Answer:
+Presumably the gp_lock acquired on line 18 excludes
+any changes, including any insertions that rcu_dereference()
+would protect against.
+Therefore, any insertions will be delayed until after gp_lock
+is released on line 26, which in turn means that
+rcu_access_pointer() suffices.
+

Back to Quick Quiz 12. + + +

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? + + +

Answer: +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. + + +

Back to Quick Quiz 13. + + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? + + +

Answer:
+In CONFIG_PREEMPT=n kernels, synchronize_rcu()
+maps directly to synchronize_sched().
+Therefore, synchronize_rcu() works normally throughout
+boot in CONFIG_PREEMPT=n kernels.
+However, your code must also work in CONFIG_PREEMPT=y kernels,
+so it is still necessary to avoid invoking synchronize_rcu()
+during scheduler initialization.
+

Back to Quick Quiz 14. + + + diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 000000000000..1168010c39fe --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -0,0 +1,2643 @@ + + + A Tour Through RCU's Requirements [LWN.net] + + +

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought
+of as an informal, high-level specification for RCU.
+It is important to understand that RCU's specification is primarily
+empirical in nature;
+in fact, I learned about many of these requirements the hard way.
+This situation might cause some consternation; however, not only
+has this learning process been a lot of fun, but it has also been
+a great privilege to work with so many people willing to apply
+technologies in interesting new ways.
+

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + RCU Primitives Guaranteed to Execute Unconditionally +
  4. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

@@QQ@@ +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +

@@QQA@@ +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. +

@@QQE@@ + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

@@QQ@@ +Why is the synchronize_rcu() on line 28 needed? +

@@QQA@@ +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). +

@@QQE@@ + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +
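+
+For illustration only, the following minimal sketch shows the forbidden
+pattern (the do_something_forbidden() function is hypothetical and appears
+nowhere else in this document).
+The synchronize_rcu() on line 4 waits for a grace period that cannot
+complete while the function remains within its own RCU read-side
+critical section, resulting in deadlock or worse:
+
+ 1 void do_something_forbidden(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   synchronize_rcu(); /* BUG: waits on a grace period from inside a reader. */
+ 5   do_something();
+ 6   rcu_read_unlock();
+ 7 }
+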

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = b;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +

@@QQ@@ +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +

@@QQA@@ +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. +

@@QQE@@ + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +
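+
+As a minimal sketch of this last rule, the following hypothetical variant
+of do_something_gp() is buggy because it allows the pointer returned by
+rcu_dereference() to leak out of the RCU read-side critical section
+without handing protection off to any other synchronization mechanism:
+
+ 1 bool do_something_gp_leaky(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   rcu_read_unlock();
+ 6   if (p) {
+ 7     do_something(p->a, p->b); /* BUG: *p might already have been freed. */
+ 8     return true;
+ 9   }
+10   return false;
+11 }
+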

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +
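+
+For example, the following hypothetical helper (not part of the earlier
+examples) uses rcu_access_pointer() to test whether an RCU-protected
+element is present without dereferencing the pointer, and therefore
+without needing to be within an RCU read-side critical section:
+
+ 1 bool gp_exists(void)
+ 2 {
+ 3   return rcu_access_pointer(gp) != NULL; /* Tested, never dereferenced. */
+ 4 }
+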

@@QQ@@ +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +

@@QQA@@ +Let's start with what happens to do_something_gp() +if it fails to use rcu_dereference(). +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from gp in a byte-at-a-time +manner, resulting in load tearing, in turn resulting in a bytewise +mash-up of two distinct pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + +

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). +

@@QQE@@ + +

+This simple linked-data-structure scenario clearly demonstrates the need +for RCU's stringent memory-ordering guarantees on systems with more than +one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, but also in the case where the thread executing the + synchronize_rcu() migrates in the meantime. +
+ +

@@QQ@@ +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +

@@QQA@@ +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. +

@@QQE@@ + +

@@QQ@@ +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +

@@QQA@@ +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! +

@@QQE@@ + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(), however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. + +
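+
+The following sketch shows one possible shape of such a read-to-write
+upgrade.
+The element type, the lookup() helper, and the per-element ->lock,
+->dead, and ->value fields are hypothetical, standing in for whatever
+the actual RCU-protected data structure provides:
+
+ 1 void upgrade_and_update(int key, int new_value)
+ 2 {
+ 3   struct elem *e;
+ 4
+ 5   rcu_read_lock();
+ 6   e = lookup(key);        /* Lockless RCU-protected search. */
+ 7   if (e) {
+ 8     spin_lock(&e->lock);  /* Upgrade: acquire the update-side lock. */
+ 9     if (!e->dead)         /* Recheck now that updates are excluded. */
+10       e->value = new_value;
+11     spin_unlock(&e->lock);
+12   }
+13   rcu_read_unlock();
+14 }
+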

@@QQ@@ +But how does the upgrade-to-write operation exclude other readers? +

@@QQA@@ +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. +

@@QQE@@ + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long, however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

@@QQ@@ +Can't the compiler also reorder this code? +

@@QQA@@ +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. +

@@QQE@@ + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +

@@QQ@@ +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +

@@QQA@@ +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. +

@@QQE@@ + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
+ +is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

@@QQ@@ +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +

@@QQA@@ +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. +

@@QQE@@ + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +
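+
+For example, suppose that a hypothetical do_cleanup() function must not
+run concurrently with any do_something() call that is invoked with
+preemption disabled.
+RCU-sched treats each preempt-disable region as a read-side critical
+section, and synchronize_sched() waits for all pre-existing such regions:
+
+ 1 void reader(void)
+ 2 {
+ 3   preempt_disable();  /* Acts as an RCU-sched read-side critical section. */
+ 4   do_something();
+ 5   preempt_enable();
+ 6 }
+ 7
+ 8 void updater(void)
+ 9 {
+10   synchronize_sched(); /* Waits for pre-existing preempt-disable regions. */
+11   do_cleanup();
+12 }
+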

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. 
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. + +

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. + +

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

@@QQ@@ +What about sleeping locks? +

@@QQA@@ +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +(SRCU) +read-side critical sections. +In addition, the -rt patchset turns spinlocks into sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquired within -rt-Linux-kernel RCU read-side critical sections. + +

+Note that it is legal for a normal RCU read-side critical section +to conditionally acquire a sleeping lock (as in mutex_trylock()), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping lock. +The key point is that things like mutex_trylock() +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, mutex_trylock() returns immediately without sleeping. +

@@QQE@@ + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation from a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. + +

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +
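+
+For example, a latency-sensitive variant of the earlier
+remove_gp_synchronous() might simply substitute the expedited primitive.
+This sketch assumes the same gp and gp_lock as the earlier examples;
+aside from its (hypothetical) name, only line 13 differs:
+
+ 1 bool remove_gp_expedited(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu_expedited(); /* Tens of microseconds, not milliseconds. */
+14   kfree(p);
+15   return true;
+16 }
+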

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_access_pointer(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears +on lines 1-5. +The function remove_gp_cb() is passed to call_rcu() +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as remove_gp_synchronous(), +but without forcing the updater to wait for a grace period to elapse. +The call_rcu() function may be used in a number of +situations where neither synchronize_rcu() nor +synchronize_rcu_expedited() would be legal, +including within preempt-disable code, local_bh_disable() code, +interrupt-disable code, and interrupt handlers. +However, even call_rcu() is illegal within NMI handlers. +The callback function (remove_gp_cb() in this case) will be +executed within softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of local_bh_disable(). +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. + +

@@QQ@@ +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +

@@QQA@@ +Presumably the gp_lock acquired on line 18 excludes +any changes, including any insertions that rcu_dereference() +would protect against. +Therefore, any insertions will be delayed until after gp_lock +is released on line 26, which in turn means that +rcu_access_pointer() suffices. +

@@QQE@@ + +
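+
+For example, a callback needing to do more than a simple kfree() might
+hand the bulk of its work off to a workqueue, as in the following sketch,
+which assumes a hypothetical work member added to struct foo and a
+hypothetical long-running do_heavy_cleanup() function:
+
+ 1 static void heavy_cleanup_wq(struct work_struct *wp)
+ 2 {
+ 3   struct foo *p = container_of(wp, struct foo, work);
+ 4
+ 5   do_heavy_cleanup(p); /* Long-running work, outside of softirq context. */
+ 6   kfree(p);
+ 7 }
+ 8
+ 9 static void remove_gp_cb_deferred(struct rcu_head *rhp)
+10 {
+11   struct foo *p = container_of(rhp, struct foo, rh);
+12
+13   INIT_WORK(&p->work, heavy_cleanup_wq);
+14   schedule_work(&p->work); /* Keep the RCU callback itself short. */
+15 }
+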

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes +kfree_rcu() and proceeds, without any need to pay any +further attention to the subsequent grace period and kfree(). +It is permissible to invoke kfree_rcu() from the same +environments as for call_rcu(). +Interestingly enough, DYNIX/ptx had the equivalents of +call_rcu() and kfree_rcu(), but not +synchronize_rcu(). +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +synchronize_rcu() simply open-coded it. + +

@@QQ@@ +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +

@@QQA@@ +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. +

@@QQE@@ + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for get_state_synchronize_rcu() and +cond_synchronize_rcu() has appeared quite recently, +so it is too early to tell whether they will stand the test of time. + +

+RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +
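+
+For example, a function containing an RCU read-side critical section may
+freely call another function containing its own, with RCU treating the
+result as one enclosing critical section (the outer_reader() and
+inner_reader() functions are hypothetical):
+
+ 1 void inner_reader(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   do_something();
+ 5   rcu_read_unlock();
+ 6 }
+ 7
+ 8 void outer_reader(void)
+ 9 {
+10   rcu_read_lock();
+11   inner_reader(); /* Nested reader composes into one critical section. */
+12   rcu_read_unlock();
+13 }
+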

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of close(open(path)) +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock() + everywhere that it is needed, so kernels built with + CONFIG_PROVE_RCU=y will splat if + rcu_dereference() is used outside of an + RCU read-side critical section. + Update-side code can use rcu_dereference_protected(), + which takes a + lockdep expression + to indicate what is providing the protection. + If the indicated protection is not provided, a lockdep splat + is emitted. + (A sketch of this usage appears after this list.) + +

    + Code shared between readers and updaters can use + rcu_dereference_check(), which also takes a + lockdep expression, and emits a lockdep splat if neither + rcu_read_lock() nor the indicated protection + is in place. + In addition, rcu_dereference_raw() is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, rcu_read_lock_held() is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +

  2. A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The rcu_lockdep_assert() does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. +
  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +
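+The following sketch, which is not taken from any particular kernel subsystem, illustrates several of these diagnostic interfaces. The foo structure, the global_foo pointer, and foo_mutex are invented for this example, and global_foo is assumed to be non-NULL once initialized.
+
+ 1 struct foo {
+ 2   int a;
+ 3 };
+ 4
+ 5 static DEFINE_MUTEX(foo_mutex);
+ 6 static struct foo __rcu *global_foo;  /* __rcu enables sparse checking. */
+ 7
+ 8 /* Reader:  Must be invoked within an RCU read-side critical section. */
+ 9 int get_foo_a(void)
+10 {
+11   return rcu_dereference(global_foo)->a;
+12 }
+13
+14 /* Updater:  Holds foo_mutex, so no rcu_read_lock() is required. */
+15 struct foo *get_foo_for_update(void)
+16 {
+17   return rcu_dereference_protected(global_foo,
+18                                    lockdep_is_held(&foo_mutex));
+19 }
+20
+21 /* Code shared by readers and updaters. */
+22 struct foo *get_foo_any(void)
+23 {
+24   return rcu_dereference_check(global_foo,
+25                                lockdep_is_held(&foo_mutex));
+26 }
+
+In a CONFIG_PROVE_RCU=y kernel, invoking get_foo_a() outside of an RCU read-side critical section, or get_foo_for_update() without holding foo_mutex, produces a lockdep splat, and sparse complains about any plain-assignment access to global_foo.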

+This is not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by +kernel boot parameters and Kconfig options. +Unfortunately, the Kconfig system will explicitly ask users +about new Kconfig options, which requires that almost all of them +be hidden behind a CONFIG_RCU_EXPERT Kconfig option. + +

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system would still run correctly, the extra +kthreads would needlessly consume memory and could cause confusion +when they showed up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier; +however, this would not be a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +

+Perhaps surprisingly, synchronize_rcu(), +synchronize_rcu_bh() +(discussed below), +and +synchronize_sched() +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that a call to synchronize_rcu() (or one of its friends) +is itself a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

@@QQ@@ +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +

@@QQA@@ +In CONFIG_PREEMPT=n kernels, synchronize_rcu() +maps directly to synchronize_sched(). +Therefore, synchronize_rcu() works normally throughout +boot in CONFIG_PREEMPT=n kernels. +However, your code must also work in CONFIG_PREEMPT=y kernels, +so it is still necessary to avoid invoking synchronize_rcu() +during scheduler initialization. +

@@QQE@@ + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +
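+As a rough sketch, and assuming a hypothetical module whose exit path first stops all sources of new callbacks (the foo_unregister_everything() helper is invented for this example), a module's exit function might look like this:
+
+ 1 static void __exit foo_exit(void)
+ 2 {
+ 3   /* First prevent any further call_rcu() invocations, for example, */
+ 4   /* by unregistering notifiers and removing externally visible    */
+ 5   /* interfaces (the details are specific to the module).          */
+ 6   foo_unregister_everything();
+ 7
+ 8   /* Then wait for all already-posted RCU callbacks to be invoked. */
+ 9   rcu_barrier();
+10
+11   /* Only now is it safe for the module's code and data to vanish. */
+12 }
+13 module_exit(foo_exit);
+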

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +
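+For example, a hypothetical CPU-hotplug notifier may safely wait for a normal grace period, but must avoid expedited grace periods and rcu_barrier(), as in the following sketch:
+
+ 1 static int foo_cpu_notify(struct notifier_block *nb,
+ 2                           unsigned long action, void *hcpu)
+ 3 {
+ 4   switch (action) {
+ 5   case CPU_DOWN_PREPARE:
+ 6     /* Legal:  A normal synchronous grace period. */
+ 7     synchronize_rcu();
+ 8     /* Not legal here:  synchronize_rcu_expedited() or rcu_barrier(), */
+ 9     /* either of which could deadlock with the hotplug operation.     */
+10     break;
+11   }
+12   return NOTIFY_OK;
+13 }
+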

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless +it also releases them before exiting that same +RCU read-side critical section. +This same prohibition also applies to any lock that is acquired +while holding any lock to which this prohibition applies. +Violating this rule results in deadlock. + +
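+The following schematic, which is not actual scheduler code and which uses a generic rq_lock parameter to stand in for any runqueue or priority-inheritance lock, shows the permitted pattern:
+
+ 1 static void sched_path(raw_spinlock_t *rq_lock)
+ 2 {
+ 3   rcu_read_lock();
+ 4   raw_spin_lock(rq_lock);    /* Acquired within the critical section... */
+ 5   /* ... do scheduler work ... */
+ 6   raw_spin_unlock(rq_lock);  /* ...and released before...               */
+ 7   rcu_read_unlock();         /* ...the outermost rcu_read_unlock().     */
+ 8 }
+
+Holding such a lock across the outermost rcu_read_unlock() is what invites deadlock.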

+For RCU's part, the preemptible-RCU rcu_read_unlock() +implementation must be written carefully to avoid similar deadlocks. +In particular, rcu_read_unlock() must tolerate an +interrupt where the interrupt handler invokes both +rcu_read_lock() and rcu_read_unlock(). +This possibility requires rcu_read_unlock() to use +negative nesting levels to avoid destructive recursion via +the interrupt handler's use of RCU. + +

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads on kernels built with +CONFIG_NO_HZ_FULL=y +did come as a surprise [PDF]. +RCU has made good progress towards meeting this requirement, even +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, +but there is room for further improvement. + +

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +

+Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with CONFIG_PROVE_RCU=y will splat +if you try it.) +The RCU_NONIDLE() macro and _rcuidle +event tracing are provided to work around this restriction. +In addition, rcu_is_watching() may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and RCU_NONIDLE() on the other while inspecting +idle-loop code. +Steven Rostedt supplied _rcuidle event tracing, +which is used quite heavily in the idle loop. + +
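+For example, code running in or near the idle loop might, as a rough sketch, do something like the following, where do_trace_event() and do_idle_work() are hypothetical helpers:
+
+ 1 static void idle_notify(void)
+ 2 {
+ 3   /* RCU is not watching the idle loop, so wrap the RCU-using call. */
+ 4   RCU_NONIDLE(do_trace_event());
+ 5
+ 6   /* Alternatively, check before entering a read-side critical section. */
+ 7   if (rcu_is_watching()) {
+ 8     rcu_read_lock();
+ 9     do_idle_work();
+10     rcu_read_unlock();
+11   }
+12 }
+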

+It is similarly socially unacceptable to interrupt an +nohz_full CPU running in userspace. +RCU must therefore track nohz_full userspace +execution. +And in +CONFIG_NO_HZ_FULL_SYSIDLE=y +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +

+These energy-efficiency requirements have proven quite difficult to +understand and to meet; for example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +real energy savings running on real hardware [PDF]. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent the callers' ire at RCU's energy-efficiency bugs! + +

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the +earlier discussion, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of rcu_read_lock() could be inlined; however, doing +this requires resolving #include issues with the +task_struct structure. + +

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must decrease the +per-operation overhead, witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(). +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base: a billion instances, each failing once per million +years, works out to roughly one thousand failures per year, or nearly +three per day. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because +rcu_read_lock_bh() and rcu_read_unlock_bh() +disable and re-enable softirq handlers, any attempt to start a softirq +handler during the +RCU-bh read-side critical section will be deferred. +In this case, rcu_read_unlock_bh() +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than rcu_read_unlock_bh(), but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the rcu_read_unlock_bh(). +This can of course make it appear at first glance as if +rcu_read_unlock_bh() was executing very slowly. + +

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +
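+As a rough sketch, a reader and an updater for a single RCU-bh-protected pointer might look as follows, with the conn structure and cur_conn pointer invented for illustration, and with update-side mutual exclusion assumed to be supplied by the caller of conn_clear():
+
+ 1 struct conn {
+ 2   int port;
+ 3   struct rcu_head rh;
+ 4 };
+ 5
+ 6 static struct conn __rcu *cur_conn;
+ 7
+ 8 /* Reader, perhaps running under heavy softirq (networking) load. */
+ 9 int conn_port(void)
+10 {
+11   struct conn *c;
+12   int port = -1;
+13
+14   rcu_read_lock_bh();
+15   c = rcu_dereference_bh(cur_conn);
+16   if (c)
+17     port = c->port;
+18   rcu_read_unlock_bh();
+19   return port;
+20 }
+21
+22 static void conn_free_cb(struct rcu_head *rhp)
+23 {
+24   kfree(container_of(rhp, struct conn, rh));
+25 }
+26
+27 /* Updater:  Unpublish, then reclaim after an RCU-bh grace period. */
+28 void conn_clear(void)
+29 {
+30   struct conn *old = rcu_dereference_protected(cur_conn, 1);
+31
+32   RCU_INIT_POINTER(cur_conn, NULL);
+33   if (old)
+34     call_rcu_bh(&old->rh, conn_free_cb);
+35 }
+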

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, RCU-sched was created, which follows “classic” +RCU in that an RCU-sched grace period waits for pre-existing +interrupt and NMI handlers. +In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +CONFIG_PREEMPT=y provide a separate implementation for each. + +

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The +RCU-sched API +includes +rcu_read_lock_sched(), +rcu_read_unlock_sched(), +rcu_read_lock_sched_notrace(), +rcu_read_unlock_sched_notrace(), +rcu_dereference_sched(), +rcu_dereference_sched_check(), +synchronize_sched(), +synchronize_rcu_sched_expedited(), +call_rcu_sched(), +rcu_barrier_sched(), and +rcu_read_lock_sched_held(). +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +preempt_disable() and preempt_enable(), +local_irq_save() and local_irq_restore(), +and so on. + +
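+For example, the following sketch (with the cfg structure, cur_cfg pointer, and cfg_lock all invented for illustration) relies on preempt_disable() as the RCU-sched read-side marker:
+
+ 1 struct cfg {
+ 2   int val;
+ 3 };
+ 4
+ 5 static struct cfg __rcu *cur_cfg;   /* Assumed non-NULL after init. */
+ 6 static DEFINE_SPINLOCK(cfg_lock);
+ 7
+ 8 /* Reader:  Any preemption-disabled region is an RCU-sched reader. */
+ 9 int cfg_val(void)
+10 {
+11   int val;
+12
+13   preempt_disable();
+14   val = rcu_dereference_sched(cur_cfg)->val;
+15   preempt_enable();
+16   return val;
+17 }
+18
+19 /* Updater:  Swap in the new configuration, then reclaim the old one. */
+20 void cfg_replace(struct cfg *newc)
+21 {
+22   struct cfg *old;
+23
+24   spin_lock(&cfg_lock);
+25   old = rcu_dereference_protected(cur_cfg, lockdep_is_held(&cfg_lock));
+26   rcu_assign_pointer(cur_cfg, newc);
+27   spin_unlock(&cfg_lock);
+28   synchronize_sched();  /* Wait for pre-existing preemption-disabled regions. */
+29   kfree(old);
+30 }
+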

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you can afford to block in an RCU read-side critical +section, you can probably also afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections; +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 instead acquired a mutex that was held across +a synchronize_srcu() for domain ss, +deadlock would still be possible. +Furthermore, if line 5 instead acquired a mutex that was held across +a synchronize_srcu() for some other domain ss1, +and if an ss1-domain SRCU read-side critical section +acquired another mutex that was held across an ss-domain +synchronize_srcu(), +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +
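+For example, a statically allocated domain can be created with DEFINE_STATIC_SRCU(), after which the update side might, in rough outline, look like the following, where the unpublish and reclaim steps are hypothetical placeholders:
+
+ 1 DEFINE_STATIC_SRCU(my_srcu);
+ 2
+ 3 void my_update(void)
+ 4 {
+ 5   unpublish_element();         /* Hypothetical removal step.      */
+ 6   synchronize_srcu(&my_srcu);  /* Waits only for my_srcu readers. */
+ 7   free_element();              /* Hypothetical reclaim step.      */
+ 8 }
+
+A slow reader in some other SRCU domain has no effect on this synchronize_srcu() invocation.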

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as rcu_read_lock() and rcu_read_unlock(). +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +rcu_read_unlock(). +Although synchronize_rcu() would guarantee that execution +reached the rcu_read_unlock(), it would not be able to +guarantee that execution had completely left the trampoline. + +

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +
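+In rough outline, and with the struct tramp type and the trampoline-management helpers invented for this example, freeing an old trampoline might look as follows:
+
+ 1 void replace_trampoline(struct tramp *new_tp)
+ 2 {
+ 3   struct tramp *old_tp;
+ 4
+ 5   old_tp = install_trampoline(new_tp);  /* Hypothetical: unhooks the old one. */
+ 6
+ 7   /*
+ 8    * Wait until every task has done a voluntary context switch (or has
+ 9    * run in userspace), so that no task can still be executing in old_tp.
+10    */
+11   synchronize_rcu_tasks();
+12   free_trampoline(old_tp);              /* Hypothetical reclaim step. */
+13 }
+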

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +
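+For example, a hypothetical system built from 16-CPU sockets might be booted with the following kernel parameter so that each leaf rcu_node structure covers exactly one socket:
+
+ 1 rcutree.rcu_fanout_leaf=16
+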

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

@@QQAL@@ + + diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh new file mode 100755 index 000000000000..d354f069559b --- /dev/null +++ b/Documentation/RCU/Design/htmlqqz.sh @@ -0,0 +1,108 @@ +#!/bin/sh +# +# Usage: sh htmlqqz.sh file +# +# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. +# Commands, all of which must be on a line by themselves: +# +# "

@@QQ@@": Start of a quick quiz. +# "

@@QQA@@": Start of a quick-quiz answer. +# "

@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. +# "

@@QQAL@@": Place to put quick-quiz answer list. +# +# Places the result in file.html. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. + +fn=$1 +if test ! -r $fn.htmlx +then + echo "Error: $fn.htmlx unreadable." + exit 1 +fi + +echo "" > $fn.html +echo "" >> $fn.html +awk < $fn.htmlx >> $fn.html ' + +state == "" && $1 != "

@@QQ@@" && $1 != "

@@QQAL@@" { + print $0; + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR " (expected

@@QQ@@ or

@@QQAL@@)." > "/dev/stderr" + next; +} + +state == "" && $1 == "

@@QQ@@" { + qqn++; + qqlineno = NR; + haveqq = 1; + state = "qq"; + print "

Quick Quiz " qqn ":" + next; +} + +state == "qq" && $1 != "

@@QQA@@" { + qq[qqn] = qq[qqn] $0 "\n"; + print $0 + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR ". (expected

@@QQA@@)" > "/dev/stderr" + next; +} + +state == "qq" && $1 == "

@@QQA@@" { + state = "qqa"; + print "
Answer" + next; +} + +state == "qqa" && $1 != "

@@QQE@@" { + qqa[qqn] = qqa[qqn] $0 "\n"; + if ($0 ~ /^

@@QQ/) + print "Bad Quick Quiz command: " NR " (expected

@@QQE@@)." > "/dev/stderr" + next; +} + +state == "qqa" && $1 == "

@@QQE@@" { + state = ""; + next; +} + +state == "" && $1 == "

@@QQAL@@" { + haveqq = ""; + print "

" + print "Answers to Quick Quizzes

" + print ""; + for (i = 1; i <= qqn; i++) { + print "" + print "

Quick Quiz " i ":" + print qq[i]; + print ""; + print "

Answer:" + print qqa[i]; + print ""; + print "

Back to Quick Quiz " i "." + print ""; + } + next; +} + +END { + if (state != "") + print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" + else if (haveqq) + print "Missing \"

@@QQAL@@\", no Quick Quiz." > "/dev/stderr" +}' -- cgit v1.2.3 From 701e80312fd10270f9c44371e5a229d37a9ae172 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:06:44 -0700 Subject: Documentation: Record bottom-bit-zero guarantee for ->next This commit records RCU's guarantee that the bottom bit of the rcu_head structure's ->next field will remain zero for callbacks posted via call_rcu(), but not necessarily for kfree_rcu() or some possible future call_rcu_lazy() variant that might one day be created for energy-efficiency purposese. Signed-off-by: Paul E. McKenney [ paulmck: Updates URLs as suggested by Josh Triplett. ] --- .../RCU/Design/Requirements/Requirements.html | 43 ++++++++++++++++++++++ .../RCU/Design/Requirements/Requirements.htmlx | 43 ++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 36de7aaa941e..871f627b7713 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1678,6 +1678,7 @@ Some of the relevant points of interest are as follows:

  • Scheduler and RCU.
  • Tracing and RCU.
  • Energy Efficiency. +
  • Memory Efficiency.
  • Performance, Scalability, Response Time, and Reliability. @@ -2006,6 +2007,48 @@ I learned of many of these requirements via angry phone calls: Flaming me on the Linux-kernel mailing list was apparently not sufficient to fully vent their ire at RCU's energy-efficiency bugs! +

    Memory Efficiency

    + +

    +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

    +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

    +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. +

    Performance, Scalability, Response Time, and Reliability

    diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 1168010c39fe..a544db4646c6 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1837,6 +1837,7 @@ Some of the relevant points of interest are as follows:
  • Scheduler and RCU.
  • Tracing and RCU.
  • Energy Efficiency. +
  • Memory Efficiency.
  • Performance, Scalability, Response Time, and Reliability. @@ -2173,6 +2174,48 @@ I learned of many of these requirements via angry phone calls: Flaming me on the Linux-kernel mailing list was apparently not sufficient to fully vent their ire at RCU's energy-efficiency bugs! +

    Memory Efficiency

    + +

    +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

    +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

    +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. +

    Performance, Scalability, Response Time, and Reliability

    -- cgit v1.2.3 From 01d3ad3834891f19a2620a105415feac93296eeb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:35:35 -0700 Subject: documentation: Cover requirements controlling stall warnings This commit adds verbiage on boot and sysfs parameters that can be used to control RCU CPU stall warnings, both to change the timeout and to suppress these warnings entirely. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 25 +++++++++++++++++++++- .../RCU/Design/Requirements/Requirements.htmlx | 25 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 871f627b7713..cc5b587c0ec5 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1618,12 +1618,35 @@ guard against mishaps and misuse: supplied the needed patch.
  • An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat. + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. However, RCU is not obligated to produce this splat unless there is a grace period waiting on that particular RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    This requirement made itself known in the early 1990s, pretty much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux.

  • Although it would be very good to detect pointers leaking out of RCU read-side critical sections, there is currently no good way of doing this. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index a544db4646c6..23524d75a3c3 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1777,12 +1777,35 @@ guard against mishaps and misuse: supplied the needed patch.
  • An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat. + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. However, RCU is not obligated to produce this splat unless there is a grace period waiting on that particular RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    This requirement made itself known in the early 1990s, pretty much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux.

  • Although it would be very good to detect pointers leaking out of RCU read-side critical sections, there is currently no good way of doing this. -- cgit v1.2.3 From 0825458b1dbc39ec6840ee2e45b1fedb1b4b4ca1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:43:31 -0700 Subject: documentation: Composability analogies This commit expands on RCU's composability by comparing it to that of transactional memory and of locking. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 8 ++++++++ Documentation/RCU/Design/Requirements/Requirements.htmlx | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index cc5b587c0ec5..105247149975 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1494,6 +1494,14 @@ in RCU implicitly splitting the enclosing RCU read-side critical section, neither of which is conducive to a long-lived and prosperous kernel. +

    +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. +

    In short, although RCU read-side critical sections are highly composable, care is required in some situations, just as is the case for any other diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 23524d75a3c3..5b76e21fa092 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1653,6 +1653,14 @@ in RCU implicitly splitting the enclosing RCU read-side critical section, neither of which is conducive to a long-lived and prosperous kernel. +

    +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. +

    In short, although RCU read-side critical sections are highly composable, care is required in some situations, just as is the case for any other -- cgit v1.2.3 From a4b575627e8d1a2498a921940813266d4e26ff56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:52:25 -0700 Subject: documentation: Expand on scheduler/RCU deadlock requirements This commit adds a second option for avoiding scheduler/RCU deadlocks, namely that preemption be disabled across the entire RCU read-side critical section in question. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 14 +++++++++----- Documentation/RCU/Design/Requirements/Requirements.htmlx | 14 +++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 105247149975..ab513ed229d7 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1942,12 +1942,16 @@ RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. This means the scheduler is forbidden from acquiring the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless -it also releases them before exiting that same -RCU read-side critical section. -This same prohibition also applies to any lock that is acquired +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) preemption is disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. -Violating this rule results in deadlock. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock.

    For RCU's part, the preemptible-RCU rcu_read_unlock() diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 5b76e21fa092..f7c817f235e0 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2109,12 +2109,16 @@ RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. This means the scheduler is forbidden from acquiring the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless -it also releases them before exiting that same -RCU read-side critical section. -This same prohibition also applies to any lock that is acquired +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) preemption is disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. -Violating this rule results in deadlock. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock.

    For RCU's part, the preemptible-RCU rcu_read_unlock() -- cgit v1.2.3 From 4b689330b1a5858e88831b3752e9a6692a5c7bdb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Oct 2015 08:51:45 -0700 Subject: documentation: Clarify RCU memory barriers and requirements The RCU requirements do not make it absolutely clear that the memory-barrier requirements are not intended to replace the fundamental requirement that all pre-existing RCU readers complete before a grace period completes. This commit therefore pulls the memory-barrier requirements into a separate section and explicitly calls out the relationship between the memory-barrier requirements and the fundamental requirement. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 66 +++++++++++++--------- .../RCU/Design/Requirements/Requirements.htmlx | 66 +++++++++++++--------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index ab513ed229d7..96cdcf7195d5 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -80,6 +80,8 @@ These are: Grace-Period Guarantee

  • Publish-Subscribe Guarantee +
  • + Memory-Barrier Guarantees
  • RCU Primitives Guaranteed to Execute Unconditionally
  • @@ -499,9 +501,37 @@ might the compiler make use of?
    Answer

    -This simple linked-data-structure scenario clearly demonstrates the need -for RCU's stringent memory-ordering guarantees on systems with more than -one CPU: +In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

    +This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees have provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

    Memory-Barrier Guarantees

    + +

    +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU:

    1. Each CPU that has an RCU read-side critical section that @@ -554,30 +584,12 @@ Are all these memory barriers really required?
      Answer

      -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

      -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling rcu_dereference() for subscription, nor did it -have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good two hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting rcu_dereference()! +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must.

      RCU Primitives Guaranteed to Execute Unconditionally

      diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index f7c817f235e0..2d0cd90987f6 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -78,6 +78,8 @@ These are: Grace-Period Guarantee
    2. Publish-Subscribe Guarantee +
    3. + Memory-Barrier Guarantees
    4. RCU Primitives Guaranteed to Execute Unconditionally
    5. @@ -539,9 +541,37 @@ either rcu_access_pointer() or rcu_dereference().

      @@QQE@@

      -This simple linked-data-structure scenario clearly demonstrates the need -for RCU's stringent memory-ordering guarantees on systems with more than -one CPU: +In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

      +This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees have provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

      Memory-Barrier Guarantees

      + +

      +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU:

      1. Each CPU that has an RCU read-side critical section that @@ -653,30 +683,12 @@ adhered to the as-if rule than it is to actually adhere to it!

        @@QQE@@

        -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

        -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling rcu_dereference() for subscription, nor did it -have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good two hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting rcu_dereference()! +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must.

        RCU Primitives Guaranteed to Execute Unconditionally

        -- cgit v1.2.3 From c64c4b0f9a183e4c73abff848378afa6edf796c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Nov 2015 23:05:32 -0800 Subject: documentation: Update RCU requirements based on expedited changes Because RCU-sched expedited grace periods now use IPIs and interact with rcu_read_unlock(), it is no longer sufficient to disable preemption across RCU read-side critical sections that acquire and hold scheduler locks. It is now necessary to instead disable interrupts. This commit documents this change. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 12 ++++++++++-- Documentation/RCU/Design/Requirements/Requirements.htmlx | 10 +++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 96cdcf7195d5..a725f9900ec8 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,5 @@ - + @@ -1957,7 +1957,7 @@ the runqueue locks and the priority-inheritance locks in the middle of an outermost RCU read-side critical section unless either (1) it releases them before exiting that same RCU read-side critical section, or -(2) preemption is disabled across +(2) interrupts are disabled across that entire RCU read-side critical section. This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. @@ -1965,6 +1965,14 @@ Adhering to this rule prevents preemptible RCU from invoking rcu_read_unlock_special() while either runqueue or priority-inheritance locks are held, thus avoiding deadlock. +

        +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. +
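The updated rule might be sketched as follows, with my_lock standing in for one of the scheduler locks (runqueue or priority-inheritance) that this section describes; the function itself is hypothetical.

	#include <linux/irqflags.h>
	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(my_lock);	/* stands in for a runqueue/pi lock */

	static void exit_critical_section_with_lock_held(void)
	{
		unsigned long flags;

		/*
		 * Because my_lock is still held when rcu_read_unlock() runs,
		 * interrupts must be disabled across the entire RCU read-side
		 * critical section (as of v4.4); disabling preemption alone
		 * no longer suffices to avoid the deadlock described above.
		 */
		local_irq_save(flags);
		rcu_read_lock();
		raw_spin_lock(&my_lock);
		/* ... use data protected by both RCU and my_lock ... */
		rcu_read_unlock();
		raw_spin_unlock(&my_lock);
		local_irq_restore(flags);
	}

The alternative, of course, is simply to release the lock before leaving the outermost RCU read-side critical section, in which case no interrupt disabling is required.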

        For RCU's part, the preemptible-RCU rcu_read_unlock() implementation must be written carefully to avoid similar deadlocks. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 2d0cd90987f6..3a97ba490c42 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2124,7 +2124,7 @@ the runqueue locks and the priority-inheritance locks in the middle of an outermost RCU read-side critical section unless either (1) it releases them before exiting that same RCU read-side critical section, or -(2) preemption is disabled across +(2) interrupts are disabled across that entire RCU read-side critical section. This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. @@ -2132,6 +2132,14 @@ Adhering to this rule prevents preemptible RCU from invoking rcu_read_unlock_special() while either runqueue or priority-inheritance locks are held, thus avoiding deadlock. +

        +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. +

        For RCU's part, the preemptible-RCU rcu_read_unlock() implementation must be written carefully to avoid similar deadlocks. -- cgit v1.2.3 From f84cfbb0ff269b427a0db591e22ac6948c554ab4 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 23 Nov 2015 17:04:17 -0500 Subject: Documentation/memory-barriers.txt: Fix ACCESS_ONCE thinko In commit 2ecf810121c7 ("Documentation/memory-barriers.txt: Add needed ACCESS_ONCE() calls to memory-barriers.txt") the statement "Q = P" was converted to "ACCESS_ONCE(Q) = P". This should have been "Q = ACCESS_ONCE(P)". It later became "WRITE_ONCE(Q, P)". This doesn't match the following text, which is "Q = LOAD P". Change the statement to be "Q = READ_ONCE(P)". Signed-off-by: Chris Metcalf Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef9487303d0..85304ebd187c 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); the CPU will issue the following memory operations: @@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU: and always in that order. On most systems, smp_read_barrier_depends() does nothing, but it is required for DEC Alpha. The READ_ONCE() - and WRITE_ONCE() are required to prevent compiler mischief. Please - note that you should normally use something like rcu_dereference() - instead of open-coding smp_read_barrier_depends(). + is required to prevent compiler mischief. Please note that you + should normally use something like rcu_dereference() instead of + open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: -- cgit v1.2.3 From 83977d273b609477e31af4c993697b75936acde0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Sep 2015 22:11:48 -0700 Subject: rcutorture: Add batch number to script printout Currently, the scripts print "----Start batch" at the beginning of each batch, which does serve as a good visual delimiter between batches. Unfortunately, if there are a lot of batches, it is hard to quickly estimate test runtime from the output of "--dryrun sched". This commit therefore adds a batch number, so that the beginning-of-batch output looks like this "----Start batch 10" for the tenth batch. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index f6483609ebc2..013c48239a66 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -307,10 +307,10 @@ awk < $T/cfgcpu.pack \ } # Dump out the scripting required to run one test batch. 
-function dump(first, pastlast) +function dump(first, pastlast, batchnum) { - print "echo ----Start batch: `date`"; - print "echo ----Start batch: `date` >> " rd "/log"; + print "echo ----Start batch " batchnum ": `date`"; + print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; jn=1 for (j = first; j < pastlast; j++) { builddir=KVM "/b" jn @@ -371,25 +371,28 @@ END { njobs = i; nc = ncpus; first = 0; + batchnum = 1; # Each pass through the following loop considers one test. for (i = 0; i < njobs; i++) { if (ncpus == 0) { # Sequential test specified, each test its own batch. - dump(i, i + 1); + dump(i, i + 1, batchnum); first = i; + batchnum++; } else if (nc < cpus[i] && i != 0) { # Out of CPUs, dump out a batch. - dump(first, i); + dump(first, i, batchnum); first = i; nc = ncpus; + batchnum++; } # Account for the CPUs needed by the current test. nc -= cpus[i]; } # Dump the last batch. if (ncpus != 0) - dump(first, i); + dump(first, i, batchnum); }' >> $T/script cat << ___EOF___ >> $T/script -- cgit v1.2.3 From a0e3a3aa2841d5720a277de53b6882eb8b2ef698 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Dec 2015 17:34:10 -0800 Subject: rcutorture: Flag nonexistent RCU GP kthread Currently, if the RCU grace-period kthread has not yet been created, the starvation-check code will print zero for the state, which maps to TASK_RUNNING. This could clearly be quite confusing, so this commit prints ~0, which does not map to any legal ->state value. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..e2315fb57b23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1201,7 +1201,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); } /* -- cgit v1.2.3 From b1adb3e2736b695821badc715d2c7a5d873b8b94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 10:38:16 -0700 Subject: rcutorture: Dump stack when GP kthread stalls This commit increases debug information in the case where the grace-period kthread is being prevented from running by dumping that kthread's stack. Signed-off-by: Paul E. McKenney [ paulmck: Split into prior commit and this commit, as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2315fb57b23..7b78c88e19a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,12 +1196,15 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) + if (j - gpa > 2 * HZ) { pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* -- cgit v1.2.3 From 542e83329db44622a401b74b4be0ea2d5f0850be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 11:41:06 -0700 Subject: rcutorture: Default grace period to three minutes, allow override The default test grace period of two minutes is insufficient in some cases and excessive in others.
This commit therefore increases the default to three minutes, but also adds a --shutdown-grace parameter to allow the default to be overridden. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 4 +--- tools/testing/selftests/rcutorture/bin/kvm.sh | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5236e073919d..d39273dae464 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -38,8 +38,6 @@ # # Authors: Paul E. McKenney -grace=120 - T=/tmp/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 touch $T @@ -214,7 +212,7 @@ then else break fi - if test $kruntime -ge $((seconds + grace)) + if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) then echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 013c48239a66..4a431767f77a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KMAKE_ARG="" +TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu resdir="" configs="" @@ -149,6 +150,11 @@ do resdir=$2 shift ;; + --shutdown-grace) + checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error' + TORTURE_SHUTDOWN_GRACE=$2 + shift + ;; --torture) checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' TORTURE_SUITE=$2 @@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC +TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE if ! test -e $resdir then -- cgit v1.2.3 From 91bf6a83e5a121c9313ae47156dd47df46ea2aac Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 15 Oct 2015 16:10:07 -0700 Subject: rcutorture: Remove CONFIG_RCU_USER_QS from rcutorture selftest doc Commit d1ec4c34c7a9 ("rcu: Drop RCU_USER_QS in favor of NO_HZ_FULL") has removed RCU_USER_QS from Kconfig file, so remove it from some documents to avoid any confusion. Signed-off-by: Yang Shi Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/doc/TINY_RCU.txt | 1 - tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 4 ---- 2 files changed, 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 9ef33a743b73..24396ae8355b 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt @@ -20,7 +20,6 @@ CONFIG_PROVE_RCU CONFIG_NO_HZ_FULL_SYSIDLE CONFIG_RCU_NOCB_CPU -CONFIG_RCU_USER_QS Meaningless for TINY_RCU. 
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 657f3a035488..4e2b1893d40d 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE Always used in KVM testing. -CONFIG_RCU_USER_QS - - Redundant with CONFIG_NO_HZ_FULL. - CONFIG_PREEMPT_RCU CONFIG_TREE_RCU -- cgit v1.2.3 From 18aff33e7314253b9437234bd6d69ddc4827de70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:35:28 -0800 Subject: rcutorture: Print symbolic name for rcu_torture_writer_state Currently, rcu_torture_writer_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..d2988d047d66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); -- cgit v1.2.3 From 6b50e119c440b7532ed749b635a58b3839f62992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 14:39:26 -0800 Subject: rcutorture: Print symbolic name for ->gp_state Currently, ->gp_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: Updated to fix relational operator called out by Dan Carpenter. ] [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 15 +++++++++++++-- kernel/rcu/tree.h | 12 ++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7b78c88e19a3..316354109734 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1186,6 +1186,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Complain about starvation of grace-period kthread. 
*/ @@ -1197,10 +1207,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); if (rsp->gp_kthread) sched_show_task(rsp->gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb6bc90..a3fb6fe94127 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -545,6 +545,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ -- cgit v1.2.3 From 5708c6475789ac5f58ff620e78bd08ca2caa1f23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Dec 2015 12:37:57 -0800 Subject: torture: Abbreviate console error dump Currently, the scripts print a list of warning/bug indicators from the console.log file. This works well if there are only a few warnings or bugs, but can be quite annoying if there is a large number. This commit therefore prints a summary listing the number of each type of warning/bug indicator, but only if there is at least one such indicator. The full list is stored in the results directory at console.log.diags, which makes it easier to find the warning/bugs in the full console.log. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- .../selftests/rcutorture/bin/parse-console.sh | 41 ++++++++++++++++++---- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index d8f35cf116be..844787a0d7be 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -24,9 +24,6 @@ # # Authors: Paul E. McKenney -T=/tmp/abat-chk-badness.sh.$$ -trap 'rm -f $T' 0 - file="$1" title="$2" @@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? 
fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T -if test -s $T +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +if test -s $1.diags then print_warning Assertion failure in $file $title - cat $T + # cat $1.diags + summary="" + n_badness=`grep -c Badness $1` + if test "$n_badness" -ne 0 + then + summary="$summary Badness: $n_badness" + fi + n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'` + if test "$n_warn" -ne 0 + then + summary="$summary Warnings: $n_warn" + fi + n_bugs=`egrep -c 'BUG|Oops:' $1` + if test "$n_bugs" -ne 0 + then + summary="$summary Bugs: $n_bugs" + fi + n_calltrace=`grep -c 'Call Trace:' $1` + if test "$n_calltrace" -ne 0 + then + summary="$summary Call Traces: $n_calltrace" + fi + n_lockdep=`grep -c =========== $1` + if test "$n_badness" -ne 0 + then + summary="$summary lockdep: $n_badness" + fi + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + if test "$n_stalls" -ne 0 + then + summary="$summary Stalls: $n_stalls" + fi + print_warning Summary: $summary fi -- cgit v1.2.3 From c979ff991764a2e620db0b1bfb0a105b9cf78b6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Dec 2015 15:53:11 -0800 Subject: torture: Place console.log files correctly from the get-go Currently, the console output files ("console.log") are placed in the build directory initially, then copied to the results directory. One problem with this is if a qemu refuses to die in a timely fashion after a kernel hang, it will continue to write after the next qemu starts up, resulting in confusing output from the old instance of qemu. This commit prevents such confusion by placing the console.log files into the results directory to begin with, so that a given instance of qemu is always writing only to its own console.log file. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index d39273dae464..0f80eefb0bfd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -150,7 +150,7 @@ fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" # Generate architecture-specific and interaction-specific qemu arguments -qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`" +qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" # Generate qemu -append arguments qemu_append="`identify_qemu_append "$QEMU"`" @@ -166,7 +166,7 @@ then touch $resdir/buildonly exit 0 fi -echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log +echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$! 
@@ -222,6 +222,5 @@ then done fi -cp $builddir/console.log $resdir parse-torture.sh $resdir/console.log $title parse-console.sh $resdir/console.log $title -- cgit v1.2.3 From 79cfea0273876d9c438f3227b8f68c8c7ae31583 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Dec 2015 13:09:52 -0800 Subject: rcu: Remove TINY_RCU bloat from pointless boot parameters The rcu_expedited, rcu_normal, and rcu_normal_after_boot kernel boot parameters are pointless in the case of TINY_RCU because in that case synchronous grace periods, both expedited and normal, are no-ops. However, these three symbols contribute several hundred bytes of bloat. This commit therefore uses CPP directives to avoid compiling this code in TINY_RCU kernels. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/kernel-parameters.txt | 13 ++++++++----- include/linux/rcupdate.h | 9 ++++++++- kernel/ksysfs.c | 4 ++++ kernel/rcu/update.c | 7 ++++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 197305bbb9b7..d8186da15ca1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3308,20 +3308,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted. of synchronize_rcu(). This reduces latency, but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. + No effect on CONFIG_TINY_RCU kernels. rcupdate.rcu_normal= [KNL] Use only normal grace-period primitives, for example, synchronize_rcu() instead of synchronize_rcu_expedited(). This improves - real-time latency, CPU utilization, and energy - efficiency, but can expose users to increased - grace-period latency. This parameter overrides - rcupdate.rcu_expedited. + real-time latency, CPU utilization, and + energy efficiency, but can expose users to + increased grace-period latency. This parameter + overrides rcupdate.rcu_expedited. No effect on + CONFIG_TINY_RCU kernels. rcupdate.rcu_normal_after_boot= [KNL] Once boot has completed (that is, after rcu_end_inkernel_boot() has been invoked), use - only normal grace-period primitives. + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 98d9f30c02d4..47e95b80bebd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -48,8 +48,10 @@ #include +#ifndef CONFIG_TINY_RCU extern int rcu_expedited; /* for sysctl */ extern int rcu_normal; /* also for sysctl */ +#endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. 
*/ @@ -327,7 +329,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); -void rcu_end_inkernel_boot(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); @@ -335,6 +336,12 @@ struct notifier_block; int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); +#ifndef CONFIG_TINY_RCU +void rcu_end_inkernel_boot(void); +#else /* #ifndef CONFIG_TINY_RCU */ +static inline void rcu_end_inkernel_boot(void) { } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b4e2fa52d8bc..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -144,6 +144,7 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,6 +178,7 @@ static ssize_t rcu_normal_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. @@ -219,8 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 12b91f5a60a6..76b94e19430b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,11 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); - static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -172,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -185,6 +184,8 @@ void rcu_end_inkernel_boot(void) WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* -- cgit v1.2.3 From a87f203e2731ab477386c678e59033ee103018c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Oct 2015 12:38:49 -0700 Subject: rcu: Eliminate unused rcu_init_one() argument Now that the rcu_state structure's ->rda field is compile-time initialized, there is no need to pass the per-CPU rcu_data structure into rcu_init_one(). This commit therefore eliminates this now-unused parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93941d3434ad..9a4c8c0653ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4351,8 +4351,7 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. 
*/ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; @@ -4545,8 +4544,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6da888cc908..fccef5d4b198 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -777,7 +777,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* -- cgit v1.2.3 From d117c8aa1d511f76401337620b9c4ffb4c886579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:01:18 -0700 Subject: rcu: Make cpu_needs_another_gp() be bool The cpu_needs_another_gp() function is currently of type int, but only returns zero or one. Bow to reality and make it be of type bool. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a4c8c0653ff..d6863bceeb45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -597,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* -- cgit v1.2.3 From 7c9906ca5e582a773fff696975e312cef58a7386 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:59:01 -0700 Subject: rcu: Don't redundantly disable irqs in rcu_irq_{enter,exit}() This commit replaces a local_irq_save()/local_irq_restore() pair with a lockdep assertion that interrupts are already disabled. This should remove the corresponding overhead from the interrupt entry/exit fastpaths. This change was inspired by the fact that Iftekhar Ahmed's mutation testing showed that removing rcu_irq_enter()'s call to local_ird_restore() had no effect, which might indicate that interrupts were always enabled anyway. Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 8 ++++++++ include/linux/rcutree.h | 2 ++ include/linux/tracepoint.h | 4 ++-- kernel/rcu/tree.c | 32 ++++++++++++++++++++++++++------ 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba67fde..f2b667df1131 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -379,9 +379,9 @@ static inline void rcu_init_nohz(void) */ #define RCU_NONIDLE(a) \ do { \ - rcu_irq_enter(); \ + rcu_irq_enter_irqson(); \ do { a; } while (0); \ - rcu_irq_exit(); \ + rcu_irq_exit_irqson(); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4c1aaf9cce7b..64809aea661c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void) { } +static inline void rcu_irq_exit_irqson(void) +{ +} + +static inline void rcu_irq_enter_irqson(void) +{ +} + static inline void rcu_irq_exit(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9d3eda39bcd2..ad1eda9fa4da 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,6 +97,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_enter_irqson(void); +void rcu_irq_exit_irqson(void); void exit_rcu(void); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 696a339c592c..7834a8a8bf1e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -171,8 +171,8 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond), \ - rcu_irq_enter(), \ - rcu_irq_exit()); \ + rcu_irq_enter_irqson(), \ + rcu_irq_exit_irqson()); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d6863bceeb45..40940b0d0310 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -732,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -745,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -760,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -857,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. 
* * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -873,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -888,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } -- cgit v1.2.3 From f039f0af081746933d5dec3229637a18fab791ed Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 2 Nov 2015 13:21:47 +1100 Subject: rcu: Fix comment for rcu_dereference_raw_notrace rcu_dereference_raw() calls indirectly rcu_read_lock_held() while rcu_dereference_raw_notrace() does not so fix the comment about the latter. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f2b667df1131..85aabcd8b564 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -741,7 +741,7 @@ static inline void rcu_preempt_sleep_check(void) * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * - * The tracing version of rcu_dereference_raw() must not call + * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) -- cgit v1.2.3 From e11f13355b09df970495c45ed0eac1dc85dcf5c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Nov 2015 08:22:05 -0800 Subject: rcu: Move wakeup out from under rnp->lock This patch removes a potential deadlock hazard by moving the wake_up_process() in rcu_spawn_gp_kthread() out from under rnp->lock. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40940b0d0310..87b604d0b0d2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4319,8 +4319,8 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); -- cgit v1.2.3 From 45fed3e7cfb4001c80cd4bd25249d194a52bfed3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Nov 2015 23:35:00 -0800 Subject: rcu: Make rcu_gp_init() be bool rather than int The return value from rcu_gp_init() is always used as a bool, so this commit makes it be a bool. Reported-by: Iftekhar Ahmed Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 87b604d0b0d2..01a90a3bdf79 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1814,9 +1814,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. 
Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; @@ -1827,7 +1827,7 @@ static int rcu_gp_init(struct rcu_state *rsp) if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1837,7 +1837,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1929,7 +1929,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* -- cgit v1.2.3 From 69b907297f4edf13182e3fa3adc0160df077746c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Sat, 5 Dec 2015 18:14:19 -0800 Subject: list: Add lockless list traversal primitives Although list_for_each_entry_rcu() can in theory be used anywhere preemption is disabled, it can result in calls to lockdep, which cannot be used in certain constrained execution environments, such as exception handlers that do not map the entire kernel into their address spaces. This commit therefore adds list_entry_lockless() and list_for_each_entry_lockless(), which never invoke lockdep and can therefore safely be used from these constrained environments, but only as long as those environments are non-preemptible (or items are never deleted from the list). Use synchronize_sched(), call_rcu_sched(), or synchronize_sched_expedited() in updates for the needed grace periods. Of course, if items are never deleted from the list, there is no need to wait for grace periods. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed540986019..1fad79861e14 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -304,6 +304,42 @@ static inline void list_splice_init_rcu(struct list_head *list, &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) +/** + * list_entry_lockless - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_entry_lockless(ptr, type, member) \ + container_of((typeof(ptr))lockless_dereference(ptr), type, member) + +/** + * list_for_each_entry_lockless - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_for_each_entry_lockless(pos, head, member) \ + for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) + /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. -- cgit v1.2.3 From 984cf355aeaa8f2eda3861b50d0e8d3e3f77e83b Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Thu, 17 Dec 2015 17:15:10 -0800 Subject: sysrq: Fix warning in sysrq generated crash. Commit 984d74a72076a1 ("sysrq: rcu-ify __handle_sysrq") replaced spin_lock_irqsave() calls with rcu_read_lock() calls in sysrq. Since rcu_read_lock() does not disable preemption, faulthandler_disabled() in __do_page_fault() in x86/fault.c returns false. When the code later calls might_sleep() in the pagefault handler, we get the following warning: BUG: sleeping function called from invalid context at ../arch/x86/mm/fault.c:1187 in_atomic(): 0, irqs_disabled(): 0, pid: 4706, name: bash Preemption disabled at:[] printk+0x48/0x4a To fix this, we release the RCU read lock before we crash. Tested this patch on linux 3.18 by booting off one of our boards. Fixes: 984d74a72076a1 ("sysrq: rcu-ify __handle_sysrq") Signed-off-by: Ani Sinha Reviewed-by: Rik van Riel Signed-off-by: Paul E. McKenney --- drivers/tty/sysrq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 5381a728d23e..e5139402e7f8 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key) { char *killer = NULL; + /* we need to release the RCU read lock here, + * otherwise we get an annoying + * 'BUG: sleeping function called from invalid context' + * complaint from the kernel before the panic. + */ + rcu_read_unlock(); panic_on_oops = 1; /* force panic */ wmb(); *killer = 1; -- cgit v1.2.3
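As a closing illustration, the new lockless list primitive might be used as sketched below; the my_entry structure, my_list, and both functions are invented for this example, and the reader is assumed to run in a non-preemptible context, with updaters either never deleting entries or using RCU-sched grace periods as the kernel-doc above requires.

	#include <linux/errno.h>
	#include <linux/list.h>
	#include <linux/rculist.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct my_entry {
		int val;
		struct list_head node;
	};

	static LIST_HEAD(my_list);		/* hypothetical list */
	static DEFINE_SPINLOCK(my_list_lock);

	/* Updater: ordinary RCU-style addition under a lock. */
	static int my_add(int val)
	{
		struct my_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);

		if (!e)
			return -ENOMEM;
		e->val = val;
		spin_lock(&my_list_lock);
		list_add_rcu(&e->node, &my_list);
		spin_unlock(&my_list_lock);
		return 0;
	}

	/*
	 * Reader: may run in a constrained, non-preemptible environment
	 * where lockdep cannot be invoked.  If entries are ever deleted,
	 * the updater must use synchronize_sched() or call_rcu_sched()
	 * for the needed grace periods.
	 */
	static bool my_contains(int val)
	{
		struct my_entry *e;

		list_for_each_entry_lockless(e, &my_list, node)
			if (e->val == val)
				return true;
		return false;
	}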