From aa123a748ea552b18f0d4add823c29ddbddaf7b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Apr 2017 09:17:08 -0700 Subject: doc: Update RCU data-structure documentation for rcu_segcblist The rcu_segcblist data structure, which contains segmented lists of RCU callbacks, was recently added. This commit updates the documentation accordingly. Signed-off-by: Paul E. McKenney --- .../Design/Data-Structures/Data-Structures.html | 207 ++++++++++++++------- 1 file changed, 144 insertions(+), 63 deletions(-) (limited to 'Documentation/RCU/Design/Data-Structures/Data-Structures.html') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index d583c653a703..2ab38ee420c5 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -19,6 +19,8 @@ to each other. The rcu_state Structure
  • The rcu_node Structure +
  • + The rcu_segcblist Structure
  • The rcu_data Structure
  • @@ -841,6 +843,134 @@ for lockdep lock-class names. Finally, lines 64-66 produce an error if the maximum number of CPUs is too large for the specified fanout. +

    +The rcu_segcblist Structure

    + +The rcu_segcblist structure maintains a segmented list of +callbacks as follows: + +
    + 1 #define RCU_DONE_TAIL        0
    + 2 #define RCU_WAIT_TAIL        1
    + 3 #define RCU_NEXT_READY_TAIL  2
    + 4 #define RCU_NEXT_TAIL        3
    + 5 #define RCU_CBLIST_NSEGS     4
    + 6
    + 7 struct rcu_segcblist {
    + 8   struct rcu_head *head;
    + 9   struct rcu_head **tails[RCU_CBLIST_NSEGS];
    +10   unsigned long gp_seq[RCU_CBLIST_NSEGS];
    +11   long len;
    +12   long len_lazy;
    +13 };
    +
    + +

    +The segments are as follows: + +

      +
    1. RCU_DONE_TAIL: Callbacks whose grace periods have elapsed. + These callbacks are ready to be invoked. +
    2. RCU_WAIT_TAIL: Callbacks that are waiting for the + current grace period. + Note that different CPUs can have different ideas about which + grace period is current, hence the ->gp_seq field. +
    3. RCU_NEXT_READY_TAIL: Callbacks waiting for the next + grace period to start. +
    4. RCU_NEXT_TAIL: Callbacks that have not yet been + associated with a grace period. +
    + +
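To make these segments concrete, note (as described below) that an empty list has every element of ->tails[] referencing ->head. Here is a minimal initialization sketch, along the lines of what the kernel's rcu_segcblist_init() does (the helper name here is ours, and details may differ by kernel version):

  static void segcblist_init_sketch(struct rcu_segcblist *rsclp)
  {
    int i;

    rsclp->head = NULL;                 /* No callbacks yet. */
    for (i = 0; i < RCU_CBLIST_NSEGS; i++)
      rsclp->tails[i] = &rsclp->head;   /* All four segments empty. */
    rsclp->len = 0;
    rsclp->len_lazy = 0;
  }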

    +The ->head pointer references the first callback or +is NULL if the list contains no callbacks (which is +not the same as being empty). +Each element of the ->tails[] array references the +->next pointer of the last callback in the corresponding +segment of the list, or the list's ->head pointer if +that segment and all previous segments are empty. +If the corresponding segment is empty but some previous segment is +not empty, then the array element is identical to its predecessor. +Older callbacks are closer to the head of the list, and new callbacks +are added at the tail. +This relationship between the ->head pointer, the +->tails[] array, and the callbacks is shown in this +diagram: + +

    [Figure: nxtlist.svg, showing the ->head pointer, the ->tails[] array, and the enqueued callbacks] + +

    In this figure, the ->head pointer references the +first +RCU callback in the list. +The ->tails[RCU_DONE_TAIL] array element references +the ->head pointer itself, indicating that none +of the callbacks is ready to invoke. +The ->tails[RCU_WAIT_TAIL] array element references callback +CB 2's ->next pointer, which indicates that +CB 1 and CB 2 are both waiting on the current grace period, +give or take possible disagreements about exactly which grace period +is the current one. +The ->tails[RCU_NEXT_READY_TAIL] array element +references the same RCU callback that ->tails[RCU_WAIT_TAIL] +does, which indicates that there are no callbacks waiting on the next +RCU grace period. +The ->tails[RCU_NEXT_TAIL] array element references +CB 4's ->next pointer, indicating that all the +remaining RCU callbacks have not yet been assigned to an RCU grace +period. +Note that the ->tails[RCU_NEXT_TAIL] array element +always references the last RCU callback's ->next pointer +unless the callback list is empty, in which case it references +the ->head pointer. + +
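For example, enqueuing a newly posted callback touches only the RCU_NEXT_TAIL boundary, which is why ->tails[RCU_NEXT_TAIL] normally references the last callback's ->next pointer. A hedged sketch in the spirit of the kernel's rcu_segcblist_enqueue(), omitting memory ordering and the disabled-list check (the helper name is ours):

  static void segcblist_enqueue_sketch(struct rcu_segcblist *rsclp,
                                       struct rcu_head *rhp, bool lazy)
  {
    rsclp->len++;                             /* One more callback... */
    if (lazy)
      rsclp->len_lazy++;                      /* ...perhaps a lazy one. */
    rhp->next = NULL;
    *rsclp->tails[RCU_NEXT_TAIL] = rhp;       /* Link after last callback. */
    rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; /* Advance the tail pointer. */
  }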

    +There is one additional important special case for the +->tails[RCU_NEXT_TAIL] array element: It can be NULL +when this list is disabled. +Lists are disabled when the corresponding CPU is offline or when +the corresponding CPU's callbacks are offloaded to a kthread, +both of which are described elsewhere. + +
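A hedged sketch of the corresponding test, in the spirit of the kernel's rcu_segcblist_is_enabled() (the helper name is ours):

  static bool segcblist_is_enabled_sketch(struct rcu_segcblist *rsclp)
  {
    /* A NULL ->tails[RCU_NEXT_TAIL] means offline or offloaded. */
    return rsclp->tails[RCU_NEXT_TAIL] != NULL;
  }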

    CPUs advance their callbacks from the +RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the +RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments +as grace periods advance. + +

    The ->gp_seq[] array records grace-period +numbers corresponding to the list segments. +This is what allows different CPUs to have different ideas as to +which is the current grace period while still avoiding premature +invocation of their callbacks. +In particular, this allows CPUs that go idle for extended periods +to determine which of their callbacks are ready to be invoked after +reawakening. + +
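As a simplified illustration, once this CPU learns that grace period seq has completed, any segment whose recorded ->gp_seq[] number is not in the future may be merged into RCU_DONE_TAIL. The fragment below shows only the RCU_WAIT_TAIL case and omits the tail-pointer cleanup that the kernel's rcu_segcblist_advance() must also perform:

  /* ULONG_CMP_LT() is the kernel's wrap-safe comparison of
   * grace-period numbers. */
  if (!ULONG_CMP_LT(seq, rsclp->gp_seq[RCU_WAIT_TAIL]))
    rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[RCU_WAIT_TAIL];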

    The ->len counter contains the number of +callbacks in ->head, and the +->len_lazy counter contains the number of those callbacks that +are known to only free memory, and whose invocation can therefore +be safely deferred. +

    Important note: It is the ->len field that +determines whether or not there are callbacks associated with +this rcu_segcblist structure, not the ->head +pointer. +The reason for this is that all the ready-to-invoke callbacks +(that is, those in the RCU_DONE_TAIL segment) are extracted +all at once at callback-invocation time. +If callback invocation must be postponed, for example, because a +high-priority process just woke up on this CPU, then the remaining +callbacks are placed back on the RCU_DONE_TAIL segment. +Either way, the ->len and ->len_lazy counts +are adjusted after the corresponding callbacks have been invoked, and so +again it is the ->len count that accurately reflects whether +or not there are callbacks associated with this rcu_segcblist +structure. +Of course, off-CPU sampling of the ->len count requires +the use of appropriate synchronization, for example, memory barriers. +This synchronization can be a bit subtle, particularly in the case +of rcu_barrier(). +
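A hedged sketch of the extraction step just described, in the spirit of the kernel's rcu_segcblist_extract_done_cbs() (the helper name is ours, and the real function moves the callbacks into a separate rcu_cblist rather than returning a pointer):

  static struct rcu_head *
  segcblist_extract_done_sketch(struct rcu_segcblist *rsclp)
  {
    struct rcu_head *rhp = rsclp->head;  /* Ready callbacks start at ->head. */
    int i;

    if (rsclp->tails[RCU_DONE_TAIL] == &rsclp->head)
      return NULL;                       /* RCU_DONE_TAIL segment is empty. */
    rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
    *rsclp->tails[RCU_DONE_TAIL] = NULL; /* Terminate the extracted list. */
    /* Any boundary equal to the old done boundary now references ->head. */
    for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
      if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
        rsclp->tails[i] = &rsclp->head;
    return rhp;  /* ->len is adjusted only after these are invoked. */
  }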

    The rcu_data Structure

    @@ -983,62 +1113,18 @@ choice. as follows:
    - 1 struct rcu_head *nxtlist;
    - 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
    - 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
    - 4 long qlen_lazy;
    - 5 long qlen;
    - 6 long qlen_last_fqs_check;
    + 1 struct rcu_segcblist cblist;
    + 2 long qlen_last_fqs_check;
    + 3 unsigned long n_cbs_invoked;
    + 4 unsigned long n_nocbs_invoked;
    + 5 unsigned long n_cbs_orphaned;
    + 6 unsigned long n_cbs_adopted;
      7 unsigned long n_force_qs_snap;
    - 8 unsigned long n_cbs_invoked;
    - 9 unsigned long n_cbs_orphaned;
    -10 unsigned long n_cbs_adopted;
    -11 long blimit;
    + 8 long blimit;
     
    -

    The ->nxtlist pointer and the -->nxttail[] array form a four-segment list with -older callbacks near the head and newer ones near the tail. -Each segment contains callbacks with the corresponding relationship -to the current grace period. -The pointer out of the end of each of the four segments is referenced -by the element of the ->nxttail[] array indexed by -RCU_DONE_TAIL (for callbacks handled by a prior grace period), -RCU_WAIT_TAIL (for callbacks waiting on the current grace period), -RCU_NEXT_READY_TAIL (for callbacks that will wait on the next -grace period), and -RCU_NEXT_TAIL (for callbacks that are not yet associated -with a specific grace period) -respectively, as shown in the following figure. - -

    [Figure: nxtlist.svg, the previous four-segment ->nxtlist layout] - -

    In this figure, the ->nxtlist pointer references the -first -RCU callback in the list. -The ->nxttail[RCU_DONE_TAIL] array element references -the ->nxtlist pointer itself, indicating that none -of the callbacks is ready to invoke. -The ->nxttail[RCU_WAIT_TAIL] array element references callback -CB 2's ->next pointer, which indicates that -CB 1 and CB 2 are both waiting on the current grace period. -The ->nxttail[RCU_NEXT_READY_TAIL] array element -references the same RCU callback that ->nxttail[RCU_WAIT_TAIL] -does, which indicates that there are no callbacks waiting on the next -RCU grace period. -The ->nxttail[RCU_NEXT_TAIL] array element references -CB 4's ->next pointer, indicating that all the -remaining RCU callbacks have not yet been assigned to an RCU grace -period. -Note that the ->nxttail[RCU_NEXT_TAIL] array element -always references the last RCU callback's ->next pointer -unless the callback list is empty, in which case it references -the ->nxtlist pointer. - -

    CPUs advance their callbacks from the -RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the -RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments -as grace periods advance. +

    The ->cblist structure is the segmented callback list +described earlier. The CPU advances the callbacks in its rcu_data structure whenever it notices that another RCU grace period has completed. The CPU detects the completion of an RCU grace period by noticing @@ -1049,16 +1135,7 @@ Recall that each rcu_node structure's ->completed field is updated at the end of each grace period. -

    The ->nxtcompleted[] array records grace-period -numbers corresponding to the list segments. -This allows CPUs that go idle for extended periods to determine -which of their callbacks are ready to be invoked after reawakening. - -

    The ->qlen counter contains the number of -callbacks in ->nxtlist, and the -->qlen_lazy contains the number of those callbacks that -are known to only free memory, and whose invocation can therefore -be safely deferred. +

    The ->qlen_last_fqs_check and ->n_force_qs_snap fields coordinate the forcing of quiescent states from call_rcu() and friends when callback @@ -1069,6 +1146,10 @@ lists grow excessively long. The ->n_cbs_invoked, ->n_cbs_orphaned, and ->n_cbs_adopted fields count the number of callbacks invoked, sent to other CPUs when this CPU goes offline, and received from other CPUs when those other CPUs go offline. +The ->n_nocbs_invoked counter is used when the CPU's callbacks +are offloaded to a kthread. + +

    Finally, the ->blimit counter is the maximum number of RCU callbacks that may be invoked at a given time. -- cgit v1.2.3 From abb06b99484a9f5af05c7147c289faf835f68e8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Jan 2017 13:45:38 -0800 Subject: rcu: Pull rcu_sched_qs_mask into rcu_dynticks structure The rcu_sched_qs_mask variable is yet another isolated per-CPU variable, so this commit pulls it into the pre-existing rcu_dynticks per-CPU structure. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Data-Structures/Data-Structures.html | 9 ++++++++- kernel/rcu/tree.c | 12 +++++------- kernel/rcu/tree.h | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'Documentation/RCU/Design/Data-Structures/Data-Structures.html') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index d583c653a703..bf7f266e8888 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1104,6 +1104,7 @@ Its fields are as follows: 1 int dynticks_nesting; 2 int dynticks_nmi_nesting; 3 atomic_t dynticks; + 4 int rcu_sched_qs_mask;

    The ->dynticks_nesting field counts the @@ -1117,11 +1118,17 @@ NMIs are counted by the ->dynticks_nmi_nesting field, except that NMIs that interrupt non-dyntick-idle execution are not counted. -

    Finally, the ->dynticks field counts the corresponding +

    The ->dynticks field counts the corresponding CPU's transitions to and from dyntick-idle mode, so that this counter has an even value when the CPU is in dyntick-idle mode and an odd value otherwise. +
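A hedged illustration of that even/odd convention (the helper name is ours; note that a tree.c comment visible in this patch series mentions stealing low-order ->dynticks bits for other purposes, so the literal kernel test may mask differently):

  static bool cpu_is_dyntick_idle_sketch(struct rcu_dynticks *rdtp)
  {
    /* Even value: dyntick-idle; odd value: process or irq context. */
    return !(atomic_read(&rdtp->dynticks) & 0x1);
  }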

    Finally, the ->rcu_sched_qs_mask field is used +to record the fact that the RCU core code would really like to +see a quiescent state from the corresponding CPU. +This flag is checked by RCU's context-switch and cond_resched() +code, which provide a momentary idle sojourn in response. + diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3747277aae67..3a0703035874 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -272,8 +272,6 @@ void rcu_bh_qs(void) } } -static DEFINE_PER_CPU(int, rcu_sched_qs_mask); - /* * Steal a bit from the bottom of ->dynticks for idle entry/exit * control. Initially this is for TLB flushing. @@ -464,8 +462,8 @@ static void rcu_momentary_dyntick_idle(void) * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); + resched_mask = raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask); + raw_cpu_write(rcu_dynticks.rcu_sched_qs_mask, 0); /* Find the flavor that needs a quiescent state. */ for_each_rcu_flavor(rsp) { @@ -499,7 +497,7 @@ void rcu_note_context_switch(void) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -524,7 +522,7 @@ void rcu_all_qs(void) unsigned long flags; barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); @@ -1351,7 +1349,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); + rcrmp = &per_cpu(rcu_dynticks.rcu_sched_qs_mask, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || time_after(jiffies, rdp->rsp->jiffies_resched)) { if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7468b4de7e0c..e298281984dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,6 +113,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + int rcu_sched_qs_mask; /* GP old, need quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ -- cgit v1.2.3 From 9577df9a3122af08fff84b8a1a60dccf524a3891 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Jan 2017 16:18:07 -0800 Subject: rcu: Pull rcu_qs_ctr into rcu_dynticks structure The rcu_qs_ctr variable is yet another isolated per-CPU variable, so this commit pulls it into the pre-existing rcu_dynticks per-CPU structure. Signed-off-by: Paul E. 
McKenney --- .../RCU/Design/Data-Structures/Data-Structures.html | 12 ++++++++++-- kernel/rcu/tree.c | 15 ++++++--------- kernel/rcu/tree.h | 3 ++- kernel/rcu/tree_trace.c | 4 +--- 4 files changed, 19 insertions(+), 15 deletions(-) (limited to 'Documentation/RCU/Design/Data-Structures/Data-Structures.html') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index bf7f266e8888..3d0311657533 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1105,6 +1105,7 @@ Its fields are as follows: 2 int dynticks_nmi_nesting; 3 atomic_t dynticks; 4 int rcu_sched_qs_mask; + 5 unsigned long rcu_qs_ctr;

    The ->dynticks_nesting field counts the @@ -1123,12 +1124,19 @@ CPU's transitions to and from dyntick-idle mode, so that this counter has an even value when the CPU is in dyntick-idle mode and an odd value otherwise. -

    Finally, the ->rcu_sched_qs_mask field is used +

    The ->rcu_sched_qs_mask field is used to record the fact that the RCU core code would really like to -see a quiescent state from the corresponding CPU. +see a quiescent state from the corresponding CPU, so much so that +it is willing to call for heavy-weight dyntick-counter operations. This flag is checked by RCU's context-switch and cond_resched() code, which provide a momentary idle sojourn in response. +

    Finally the ->rcu_qs_ctr field is used to record +quiescent states from cond_resched(). +Because cond_resched() can execute quite frequently, this +must be quite lightweight, as in a non-atomic increment of this +per-CPU field. +
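Both halves of this mechanism appear in the tree.c hunks of this patch: the report is literally a non-atomic per-CPU increment, and the grace-period kthread detects it by comparing against a snapshot. Abridged:

  /* Reporting side, as in rcu_all_qs(): */
  this_cpu_inc(rcu_dynticks.rcu_qs_ctr);

  /* Detection side, simplified from rcu_implicit_dynticks_qs():
   * a counter that has changed since the snapshot was taken
   * implies a quiescent state. */
  if (READ_ONCE(rdp->rcu_qs_ctr_snap) !=
      per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu)) {
    /* Report a quiescent state for this CPU and grace period. */
  }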

     
    Quick Quiz:
    diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3a0703035874..82a86a67c92a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -436,9 +436,6 @@ bool rcu_eqs_special_set(int cpu) return true; } -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); - /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -542,7 +539,7 @@ void rcu_all_qs(void) rcu_sched_qs(); preempt_enable(); } - this_cpu_inc(rcu_qs_ctr); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -1315,7 +1312,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, */ rnp = rdp->mynode; if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; @@ -2024,7 +2021,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2622,7 +2619,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -3620,7 +3617,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; @@ -3933,7 +3930,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e298281984dc..76e4467bc765 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,8 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int rcu_sched_qs_mask; /* GP old, need quiescent state. */ + int rcu_sched_qs_mask; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. 
*/ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a..65b43be38e68 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -45,8 +45,6 @@ #define RCU_TREE_NONCORE #include "tree.h" -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -121,7 +119,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", rcu_dynticks_snap(rdp->dynticks), -- cgit v1.2.3 From 0f9be8cabbc343218dd2807af7308656be113045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jan 2017 13:17:02 -0800 Subject: rcu: Eliminate flavor scan in rcu_momentary_dyntick_idle() The rcu_momentary_dyntick_idle() function scans the RCU flavors, checking that one of them still needs a quiescent state before doing an expensive atomic operation on the ->dynticks counter. However, this check reduces overhead only after a rare race condition, and increases complexity. This commit therefore removes the scan and the mechanism enabling the scan. Signed-off-by: Paul E. McKenney --- .../Design/Data-Structures/Data-Structures.html | 4 +- kernel/rcu/tree.c | 62 +++++----------------- kernel/rcu/tree.h | 3 +- 3 files changed, 15 insertions(+), 54 deletions(-) (limited to 'Documentation/RCU/Design/Data-Structures/Data-Structures.html') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index 3d0311657533..e4bf20a68fa3 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1104,7 +1104,7 @@ Its fields are as follows: 1 int dynticks_nesting; 2 int dynticks_nmi_nesting; 3 atomic_t dynticks; - 4 int rcu_sched_qs_mask; + 4 bool rcu_need_heavy_qs; 5 unsigned long rcu_qs_ctr; @@ -1124,7 +1124,7 @@ CPU's transitions to and from dyntick-idle mode, so that this counter has an even value when the CPU is in dyntick-idle mode and an odd value otherwise. -

    The ->rcu_sched_qs_mask field is used +

    The ->rcu_need_heavy_qs field is used to record the fact that the RCU core code would really like to see a quiescent state from the corresponding CPU, so much so that it is willing to call for heavy-weight dyntick-counter operations. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 82a86a67c92a..c2cbc78a0625 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -443,44 +443,14 @@ bool rcu_eqs_special_set(int cpu) * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask); - raw_cpu_write(rcu_dynticks.rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. - * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rcu_dynticks_momentary_idle(); - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -494,7 +464,7 @@ void rcu_note_context_switch(void) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -519,7 +489,7 @@ void rcu_all_qs(void) unsigned long flags; barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); @@ -1275,7 +1245,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned long jtsq; - int *rcrmp; + bool *rnhqp; unsigned long rjtsc; struct rcu_node *rnp; @@ -1332,7 +1302,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. 
So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1346,16 +1316,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_dynticks.rcu_sched_qs_mask, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } @@ -4169,7 +4134,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ @@ -4191,8 +4155,6 @@ static void __init rcu_init_one(struct rcu_state *rsp) for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 76e4467bc765..b212cd0f22c7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int rcu_sched_qs_mask; /* GP old, need heavy quiescent state. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; @@ -484,7 +484,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ -- cgit v1.2.3 From 9226b10d78ffe7895549045fe388dc5e73b87eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jan 2017 14:17:50 -0800 Subject: rcu: Place guard on rcu_all_qs() and rcu_note_context_switch() actions The rcu_all_qs() and rcu_note_context_switch() do a series of checks, taking various actions to supply RCU with quiescent states, depending on the outcomes of the various checks. This is a bit much for scheduling fastpaths, so this commit creates a separate ->rcu_urgent_qs field in the rcu_dynticks structure that acts as a global guard for these checks. Thus, in the common case, rcu_all_qs() and rcu_note_context_switch() check the ->rcu_urgent_qs field, find it false, and simply return. Signed-off-by: Paul E. 
McKenney Cc: Peter Zijlstra --- .../Design/Data-Structures/Data-Structures.html | 11 ++++++- kernel/rcu/tree.c | 38 ++++++++++++++-------- kernel/rcu/tree.h | 3 +- kernel/rcu/tree_exp.h | 2 ++ kernel/rcu/tree_plugin.h | 8 +++-- 5 files changed, 44 insertions(+), 18 deletions(-) (limited to 'Documentation/RCU/Design/Data-Structures/Data-Structures.html') diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index e4bf20a68fa3..4dec89097559 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -1106,6 +1106,7 @@ Its fields are as follows: 3 atomic_t dynticks; 4 bool rcu_need_heavy_qs; 5 unsigned long rcu_qs_ctr; + 6 bool rcu_urgent_qs;

    The ->dynticks_nesting field counts the @@ -1131,12 +1132,20 @@ it is willing to call for heavy-weight dyntick-counter operations. This flag is checked by RCU's context-switch and cond_resched() code, which provide a momentary idle sojourn in response. -

    Finally the ->rcu_qs_ctr field is used to record +

    The ->rcu_qs_ctr field is used to record quiescent states from cond_resched(). Because cond_resched() can execute quite frequently, this must be quite lightweight, as in a non-atomic increment of this per-CPU field. +

    Finally, the ->rcu_urgent_qs field is used to record +the fact that the RCU core code would really like to see a quiescent +state from the corresponding CPU, with the various other fields indicating +just how badly RCU wants this quiescent state. +This flag is checked by RCU's context-switch and cond_resched() +code, which, if nothing else, non-atomically increment ->rcu_qs_ctr +in response. +
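The resulting fast path, abridged from the rcu_note_context_switch() hunk later in this patch, checks ->rcu_urgent_qs first and reaches the heavier responses only when it is set:

  /* Load rcu_urgent_qs before other flags. */
  if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
    goto out;                               /* Common case: nothing urgent. */
  this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
  if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
    rcu_momentary_dyntick_idle();           /* Heavy-weight response. */
  this_cpu_inc(rcu_dynticks.rcu_qs_ctr);    /* Light-weight response. */
  out:
    trace_rcu_utilization(TPS("End context switch"));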

     
    Quick Quiz:
    diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c2cbc78a0625..530ab6cf7a0b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -464,8 +464,14 @@ void rcu_note_context_switch(void) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -488,29 +494,26 @@ void rcu_all_qs(void) { unsigned long flags; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. - */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -1246,6 +1249,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, { unsigned long jtsq; bool *rnhqp; + bool *ruqp; unsigned long rjtsc; struct rcu_node *rnp; @@ -1281,11 +1285,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * might not be the case for nohz_full CPUs looping in the kernel. */ rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); } /* Check for the CPU being offline. */ @@ -1321,6 +1329,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, (time_after(jiffies, rdp->rsp->gp_start + jtsq) || time_after(jiffies, rdp->rsp->jiffies_resched))) { WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b212cd0f22c7..d2f276fc2edc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,8 +113,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. 
*/ atomic_t dynticks; /* Even value for idle, else odd. */ - bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..a1f52bbe9db6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..621296a6694b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } -- cgit v1.2.3
     
    Quick Quiz: