From 8be6e1b15c54402106e6ba9bc706e685458b2d2d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 29 Apr 2017 20:03:20 -0700 Subject: rcu: Use timer as backstop for NOCB deferred wakeups The handling of RCU's no-CBs CPUs has a maintenance headache, namely that if call_rcu() is invoked with interrupts disabled, the rcuo kthread wakeup must be defered to a point where we can be sure that scheduler locks are not held. Of course, there are a lot of code paths leading from an interrupts-disabled invocation of call_rcu(), and missing any one of these can result in excessive callback-invocation latency, and potentially even system hangs. This commit therefore uses a timer to guarantee that the wakeup will eventually occur. If one of the deferred-wakeup points kicks in, then the timer is simply cancelled. This commit also fixes up an incomplete removal of commits that were intended to plug remaining exit paths, which should have the added benefit of reducing the overhead of RCU's context-switch hooks. In addition, it simplifies leader-to-follower callback-list handoff by introducing locking. The call_rcu()-to-leader handoff continues to use atomic operations in order to maintain good real-time latency for common-case use of call_rcu(). Signed-off-by: Paul E. McKenney [ paulmck: Dan Carpenter fix for mod_timer() usage bug found by smatch. ] --- kernel/rcu/tree_plugin.h | 179 +++++++++++++++++++++++++++++------------------ 1 file changed, 109 insertions(+), 70 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..bb9e6e43130f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1788,22 +1788,61 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } +/* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. 
+ */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + /* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? @@ -1891,11 +1930,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1941,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -2031,6 +2064,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2042,7 +2076,11 @@ wait_again: trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); @@ -2054,7 +2092,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2104,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. */ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) + WARN_ON(signal_pending(current)); + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "WokeEmpty"); - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. 
*/ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. - */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2154,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); } } @@ -2165,6 +2176,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2191,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); /* Each pass through the following loop invokes a callback. 
*/ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2241,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. + */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2323,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* -- cgit v1.2.3 From b1a2d79fe7d210c114003362d93d529912d244df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 26 Jun 2017 12:23:46 -0700 Subject: rcu: Make NOCB CPUs migrate CBs directly from outgoing CPU RCU's CPU-hotplug callback-migration code first moves the outgoing CPU's callbacks to ->orphan_done and ->orphan_pend, and only then moves them to the NOCB callback list. This commit avoids the extra step (and simplifies the code) by moving the callbacks directly from the outgoing CPU's callback list to the NOCB callback list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++------ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 31 ++++++++++--------------------- 3 files changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aeea697d6f9f..4ea28e820f4a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3899,11 +3899,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) lockdep_assert_held(&rsp->orphan_lock); - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - /* Do the accounting first. */ if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); @@ -3928,13 +3923,20 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) { unsigned long flags; + struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. 
*/ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock(&rsp->orphan_lock); /* irqs already disabled. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aec53c4d4aec..574513cf49b4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -493,7 +493,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..ff7d5ee49816 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1961,30 +1961,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. */ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2459,7 +2448,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { -- cgit v1.2.3 From bedbb648efe178c8e4708ada272536744d0f8118 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 15:49:39 -0700 Subject: rcu: Add TPS() to event-traced strings Strings used in event tracing need to be specially handled, for example, using the TPS() macro. Without the TPS() macro, although output looks fine from within a running kernel, extracting traces from a crash dump produces garbage instead of strings. 
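As a rough illustration of the pattern being applied (the real call sites are in the diff below; this fragment is a sketch, not part of the patch), the change amounts to wrapping the string literal:

    /* Unadorned literal: a crash-dump trace extractor sees only an
     * address into a kernel image it can no longer resolve.  */
    trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty");

    /* Wrapped in TPS(), which in the RCU code maps to tracepoint_string(),
     * so trace post-processing tools can recover the literal.  */
    trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
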
This commit therefore adds the TPS() macro to some unadorned strings that were passed to event-tracing macros. Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bb9e6e43130f..14ba496a13cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2073,7 +2073,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); @@ -2083,7 +2083,7 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2111,7 +2111,7 @@ wait_again: schedule_timeout_interruptible(1); } else { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); + TPS("WokeEmpty")); } goto wait_again; } @@ -2155,7 +2155,7 @@ wait_again: static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { @@ -2163,7 +2163,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) return; } WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2198,7 +2198,7 @@ static int rcu_nocb_kthread(void *arg) rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, -- cgit v1.2.3 From c5ebe66ce774126b888617cab658f6556d23365e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jun 2017 10:32:23 -0700 Subject: rcu: Add event tracing to ->gp_tasks update at GP start There is currently event tracing to track when a task is preempted within a preemptible RCU read-side critical section, and also when that task subsequently reaches its outermost rcu_read_unlock(), but none indicating when a new grace period starts when that grace period must wait on pre-existing readers that have been been preempted at least once since the beginning of their current RCU read-side critical sections. This commit therefore adds an event trace at grace-period start in the case where there are such readers. Note that only the first reader in the list is traced. Signed-off-by: Paul E. 
McKenney Acked-by: Steven Rostedt (VMware) --- kernel/rcu/tree_plugin.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 14ba496a13cd..3e3f92e981a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -636,10 +636,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } -- cgit v1.2.3 From 2dee9404fa8c4384453a5f3a15ad74ab9480f2d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Jul 2017 21:52:31 -0700 Subject: rcu: Add assertions verifying blocked-tasks list This commit adds assertions verifying the consistency of the rcu_node structure's ->blkd_tasks list and its ->gp_tasks, ->exp_tasks, and ->boost_tasks pointers. In particular, the ->blkd_tasks lists must be empty except for leaf rcu_node structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree_plugin.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2b37f1a8e235..ac2617d857a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2410,6 +2410,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3e3f92e981a1..eadf8b95b5e9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. 
*/ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* -- cgit v1.2.3 From 9b130ad5bb8255ee8534d92d67e12b2a4887eacb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:14:18 -0700 Subject: treewide: make "nr_cpu_ids" unsigned First, number of CPUs can't be negative number. Second, different signnnedness leads to suboptimal code in the following cases: 1) kmalloc(nr_cpu_ids * sizeof(X)); "int" has to be sign extended to size_t. 2) while (loff_t *pos < nr_cpu_ids) MOVSXD is 1 byte longed than the same MOV. Other cases exist as well. Basically compiler is told that nr_cpu_ids can't be negative which can't be deduced if it is "int". Code savings on allyesconfig kernel: -3KB add/remove: 0/0 grow/shrink: 25/264 up/down: 261/-3631 (-3370) function old new delta coretemp_cpu_online 450 512 +62 rcu_init_one 1234 1272 +38 pci_device_probe 374 399 +25 ... pgdat_reclaimable_pages 628 556 -72 select_fallback_rq 446 369 -77 task_numa_find_cpu 1923 1807 -116 Link: http://lkml.kernel.org/r/20170819114959.GA30580@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/smp.c | 2 +- arch/powerpc/kernel/paca.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/sysdev/xive/native.c | 4 ++-- arch/tile/kernel/setup.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- drivers/base/cpu.c | 4 ++-- drivers/scsi/scsi_debug.c | 2 +- include/linux/cpumask.h | 6 +++--- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/topology.c | 2 +- kernel/smp.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- mm/slub.c | 2 +- 17 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ffe089942ac4..9f7195a5773e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -690,7 +690,7 @@ void __init smp_init_cpus(void) acpi_parse_gic_cpu_interface, 0); if (cpu_count > nr_cpu_ids) - pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n", + pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n", cpu_count, nr_cpu_ids); if (!bootcpu_valid) { diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 70f073d6c3b2..2ff2b8a19f71 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -224,7 +224,7 @@ void __init allocate_pacas(void) paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); - printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", + printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n", paca_size, nr_cpu_ids, paca); allocate_lppacas(nr_cpu_ids, limit); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 7de73589d8e2..0ac741fae90e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -551,7 +551,7 @@ void __init smp_setup_cpu_maps(void) if (maxcpus > nr_cpu_ids) { printk(KERN_WARNING "Partition configured for %d cpus, " - "operating system maximum is %d.\n", + "operating 
system maximum is %u.\n", maxcpus, nr_cpu_ids); maxcpus = nr_cpu_ids; } else diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 44f3a25ca630..ebc244b08d67 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -511,13 +511,13 @@ static bool xive_parse_provisioning(struct device_node *np) static void xive_native_setup_pools(void) { /* Allocate a pool big enough */ - pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids); + pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids); xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); - pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n", + pr_debug("XIVE: Pool VPs allocated at 0x%x for %u max CPUs\n", xive_pool_vps, nr_cpu_ids); } diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 443a70bccc1c..6becb96c60a0 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -1200,7 +1200,7 @@ static void __init validate_hv(void) * We use a struct cpumask for this, so it must be big enough. */ if ((smp_height * smp_width) > nr_cpu_ids) - early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n", + early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %u\n", smp_height, smp_width, nr_cpu_ids); #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7834f73efbf1..8315e2f517a7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2097,7 +2097,7 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. 
" "Processor %d/0x%x and the rest are ignored.\n", nr_cpu_ids, nr_logical_cpuids, apicid); return -EINVAL; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 6e8fcb6f7e1e..28dafed6c682 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -168,7 +168,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 54b9e89d4d6b..cd6622c3204e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1461,7 +1461,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 2c3b359b3536..321cd7b4d817 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -256,9 +256,9 @@ static ssize_t print_cpus_offline(struct device *dev, buf[n++] = ','; if (nr_cpu_ids == total_cpus-1) - n += snprintf(&buf[n], len - n, "%d", nr_cpu_ids); + n += snprintf(&buf[n], len - n, "%u", nr_cpu_ids); else - n += snprintf(&buf[n], len - n, "%d-%d", + n += snprintf(&buf[n], len - n, "%u-%d", nr_cpu_ids, total_cpus-1); } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 77a0335eb757..09ba494f8896 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5465,7 +5465,7 @@ static int sdebug_driver_probe(struct device * dev) return error; } if (submit_queues > nr_cpu_ids) { - pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%d\n", + pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%u\n", my_name, submit_queues, nr_cpu_ids); submit_queues = nr_cpu_ids; } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4bf4479a3a80..68c5a8290275 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -32,15 +32,15 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 -#define nr_cpu_ids 1 +#define nr_cpu_ids 1U #else -extern int nr_cpu_ids; +extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. 
*/ -#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) +#define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/mm/slub.c b/mm/slub.c index ddb04576b342..d39a5d3834b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4232,7 +4232,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); -- cgit v1.2.3 From 135bd1a230bb69a68c9808a7d25467318900b80a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 7 Aug 2017 11:20:10 +0530 Subject: rcu: Fix up pending cbs check in rcu_prepare_for_idle 
The pending-callbacks check in rcu_prepare_for_idle() is backwards. It should accelerate if there are pending callbacks, but the check rather uselessly accelerates only if there are no callbacks. This commit therefore inverts this check. Fixes: 15fecf89e46a ("srcu: Abstract multi-tail callback list handling") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Cc: # 4.12.x --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..fed95fa941e6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_pend_cbs(&rdp->cblist)) + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ -- cgit v1.2.3 From 9b9500da81502738efa1b485a8835f174ff7be6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Aug 2017 17:05:59 -0700 Subject: rcu: Make RCU CPU stall warnings check for irq-disabled CPUs One common question upon seeing an RCU CPU stall warning is "did the stalled CPUs have interrupts disabled?" However, the current stall warnings are silent on this point. This commit therefore uses irq_work to check whether stalled CPUs still respond to IPIs, and flags this state in the RCU CPU stall warning console messages. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 104 +++++++++++++++++++++++++++++++++++++++++------ kernel/rcu/tree.h | 5 +++ kernel/rcu/tree_plugin.h | 7 +++- 3 files changed, 103 insertions(+), 13 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0dda57a28276..12838a9a128e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1206,6 +1206,22 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } +/* + * We are reporting a quiescent state on behalf of some other CPU, so + * it is our responsibility to check for and handle potential overflow + * of the rcu_node ->gpnum counter with respect to the rcu_data counters. + * After all, the CPU might be in deep idle state, and thus executing no + * code whatsoever. + */ +static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rnp->lock); + if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) + WRITE_ONCE(rdp->gpwrap, true); + if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) + rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4; +} + /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU @@ -1216,14 +1232,33 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, - rdp->mynode->gpnum)) - WRITE_ONCE(rdp->gpwrap, true); + rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } return 0; } +/* + * Handler for the irq_work request posted when a grace period has + * gone on for too long, but not yet long enough for an RCU CPU + * stall warning. 
Set state appropriately, but just complain if + * there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gpnum = rnp->gpnum; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + /* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks @@ -1235,7 +1270,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) unsigned long jtsq; bool *rnhqp; bool *ruqp; - struct rcu_node *rnp; + struct rcu_node *rnp = rdp->mynode; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1283,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1258,12 +1294,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * might not be the case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; - rnp = rdp->mynode; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + rcu_gpnum_ovf(rnp, rdp); return 1; } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ @@ -1274,6 +1310,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; + rcu_gpnum_ovf(rnp, rdp); return 1; } @@ -1305,11 +1342,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * If more than halfway to RCU CPU stall-warning time, do - * a resched_cpu() to try to loosen things up a bit. + * If more than halfway to RCU CPU stall-warning time, do a + * resched_cpu() to try to loosen things up a bit. Also check to + * see if the CPU is getting hammered with interrupts, but only + * once per grace period, just to keep the IPIs down to a dull roar. 
*/ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) + if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { resched_cpu(rdp->cpu); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum && + (rnp->ffmask & rdp->grpmask)) { + init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + rdp->rcu_iw_pending = true; + rdp->rcu_iw_gpnum = rnp->gpnum; + irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); + } + } return 0; } @@ -1498,6 +1546,7 @@ static void print_cpu_stall(struct rcu_state *rsp) { int cpu; unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1513,7 +1562,9 @@ static void print_cpu_stall(struct rcu_state *rsp) */ pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(rsp, smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, @@ -1907,6 +1958,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); + rcu_gpnum_ovf(rnp, rdp); } return ret; } @@ -3685,6 +3737,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; + rdp->rcu_iw_pending = false; + rdp->rcu_iw_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -3722,10 +3776,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) */ int rcutree_online_cpu(unsigned int cpu) { - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return 0; /* Too early in boot for scheduler work. */ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); return 0; } @@ -3735,6 +3803,19 @@ int rcutree_online_cpu(unsigned int cpu) */ int rcutree_offline_cpu(unsigned int cpu) { + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_offline_cpu(cpu); @@ -4183,8 +4264,7 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); + rcutree_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e1f285f0a70..46a5d1991450 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -103,6 +103,7 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. 
*/ + unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -285,6 +286,10 @@ struct rcu_data { /* 8) RCU CPU stall data. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ + /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ + struct irq_work rcu_iw; /* Check for non-irq activity. */ + bool rcu_iw_pending; /* Is ->rcu_iw pending? */ + unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..14977d0470d1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void) */ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { + unsigned long delta; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = rdp->dynticks; @@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], ticks_value, ticks_title, rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, -- cgit v1.2.3 From b88697810d7c1d102a529990f9071b0f14cfe6df Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 18 Oct 2017 08:33:44 -0700 Subject: rcu: Do not include rtmutex_common.h unconditionally This commit adjusts include files and provides definitions in preparation for suppressing lockdep false-positive ->boost_mtx complaints. Without this preparation, architectures not supporting rt_mutex will get build failures. Reported-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fed95fa941e6..969eae45f05d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -54,6 +54,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); * This probably needs to be excluded from -rt builds. */ #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) +#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) #endif /* #else #ifdef CONFIG_RCU_BOOST */ @@ -911,8 +912,6 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../locking/rtmutex_common.h" - static void rcu_wake_cond(struct task_struct *t, int status) { /* -- cgit v1.2.3 From 02a7c234e54052101164368ff981bd72f7acdd65 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Sep 2017 15:36:42 -0700 Subject: rcu: Suppress lockdep false-positive ->boost_mtx complaints RCU priority boosting uses rt_mutex_init_proxy_locked() to initialize an rt_mutex structure in locked state held by some other task. When that other task releases it, lockdep complains (quite accurately, but a bit uselessly) that the other task never acquired it. 
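Schematically, the sequence that draws the complaint looks roughly like this (condensed from rcu_boost() and rcu_read_unlock_special(); a sketch rather than a verbatim copy of either):

    /* Boost kthread: make the preempted reader t the owner of
     * ->boost_mtx without t ever calling a lock function itself.  */
    rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
    rt_mutex_lock(&rnp->boost_mtx);    /* Blocks; priority-boosts t. */

    /* Reader t, at its outermost rcu_read_unlock(): releases a mutex
     * that lockdep never saw it acquire, hence the complaint.  */
    rt_mutex_unlock(&rnp->boost_mtx);
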
This complaint can suppress other, more helpful, lockdep complaints, and in any case it is a false positive. This commit therefore switches from rt_mutex_unlock() to rt_mutex_futex_unlock(), thereby avoiding the lockdep annotations. Of course, if lockdep ever learns about rt_mutex_init_proxy_locked(), addtional adjustments will be required. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 969eae45f05d..1eaab96d1a3c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -531,7 +531,7 @@ void rcu_read_unlock_special(struct task_struct *t) /* Unboost if we were boosted. */ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_unlock(&rnp->boost_mtx); + rt_mutex_futex_unlock(&rnp->boost_mtx); /* * If this was the last task on the expedited lists, -- cgit v1.2.3 From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:28 +0200 Subject: sched/isolation: Move housekeeping related code to its own file The housekeeping code is currently tied to the NOHZ code. As we are planning to make housekeeping independent from it, start with moving the relevant code to its own file. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 2 +- include/linux/sched/isolation.h | 56 ++++++++++++++++++++++++++++++++++++++ include/linux/tick.h | 37 ------------------------- init/main.c | 2 ++ kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 1 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 1 + kernel/sched/fair.c | 1 + kernel/sched/isolation.c | 33 ++++++++++++++++++++++ kernel/time/tick-sched.c | 18 ------------ kernel/watchdog.c | 1 + 12 files changed, 98 insertions(+), 56 deletions(-) create mode 100644 include/linux/sched/isolation.h create mode 100644 kernel/sched/isolation.c (limited to 'kernel/rcu/tree_plugin.h') diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index c00102b8145a..27a3272dcd48 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h new file mode 100644 index 000000000000..b7cfbc46286c --- /dev/null +++ b/include/linux/sched/isolation.h @@ -0,0 +1,56 @@ +#ifndef _LINUX_SCHED_ISOLATION_H +#define _LINUX_SCHED_ISOLATION_H + +#include +#include +#include + +#ifdef CONFIG_NO_HZ_FULL +extern cpumask_var_t housekeeping_mask; + +static inline int housekeeping_any_cpu(void) +{ + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +} + +extern void __init housekeeping_init(void); + +#else + +static inline int housekeeping_any_cpu(void) +{ + return smp_processor_id(); +} + +static inline void housekeeping_init(void) { } +#endif /* CONFIG_NO_HZ_FULL */ + + +static inline const struct cpumask *housekeeping_cpumask(void) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return housekeeping_mask; +#endif + return 
cpu_possible_mask; +} + +static inline bool is_housekeeping_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); +#endif + return true; +} + +static inline void housekeeping_affine(struct task_struct *t) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); + +#endif +} + +#endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index fe01e68bf520..68afc09aa8ac 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -137,7 +137,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; extern cpumask_var_t tick_nohz_full_mask; -extern cpumask_var_t housekeeping_mask; static inline bool tick_nohz_full_enabled(void) { @@ -161,11 +160,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) cpumask_or(mask, mask, tick_nohz_full_mask); } -static inline int housekeeping_any_cpu(void) -{ - return cpumask_any_and(housekeeping_mask, cpu_online_mask); -} - extern void tick_nohz_dep_set(enum tick_dep_bits bit); extern void tick_nohz_dep_clear(enum tick_dep_bits bit); extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); @@ -235,10 +229,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, extern void tick_nohz_full_kick_cpu(int cpu); extern void __tick_nohz_task_switch(void); #else -static inline int housekeeping_any_cpu(void) -{ - return smp_processor_id(); -} static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } @@ -260,33 +250,6 @@ static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void __tick_nohz_task_switch(void) { } #endif -static inline const struct cpumask *housekeeping_cpumask(void) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return housekeeping_mask; -#endif - return cpu_possible_mask; -} - -static inline bool is_housekeeping_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - return cpumask_test_cpu(cpu, housekeeping_mask); -#endif - return true; -} - -static inline void housekeeping_affine(struct task_struct *t) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_enabled()) - set_cpus_allowed_ptr(t, housekeeping_mask); - -#endif -} - static inline void tick_nohz_task_switch(void) { if (tick_nohz_full_enabled()) diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..4610c99ae306 100644 --- a/init/main.c +++ b/init/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void) early_irq_init(); init_IRQ(); tick_init(); + housekeeping_init(); rcu_init_nohz(); init_timers(); hrtimers_init(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..c7632f51c5a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "../time/tick-internal.h" diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5033b66d2753..79abeb0ca329 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -51,6 +51,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 78f54932ea1d..871d43d73375 100644 --- 
a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_NO_HZ_FULL) += isolation.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2288a145bf01..ad188acb7636 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56f343b8e749..591481db8c6a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..3589252ed476 --- /dev/null +++ b/kernel/sched/isolation.c @@ -0,0 +1,33 @@ +/* + * Housekeeping management. Manage the targets for routine code that can run on + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. + * + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker + * + */ + +#include +#include +#include +#include + +cpumask_var_t housekeeping_mask; + +void __init housekeeping_init(void) +{ + if (!tick_nohz_full_enabled()) + return; + + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7b258c59d78a..27d7d522ac4e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; @@ -438,13 +437,6 @@ void __init tick_nohz_init(void) return; } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); - cpumask_clear(tick_nohz_full_mask); - tick_nohz_full_running = false; - return; - } - /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own @@ -453,7 +445,6 @@ void __init tick_nohz_init(void) if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); - cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; return; } @@ -466,9 +457,6 @@ void __init tick_nohz_init(void) cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); - for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); @@ -478,12 +466,6 @@ void __init tick_nohz_init(void) WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); - - /* - * We need at least one CPU to handle housekeeping work such - * as timekeeping, unbound timers, workqueues, ... 
- */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6bcb854909c0..3c44dbaa4b9f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Oct 2017 04:42:35 +0200 Subject: sched/isolation: Introduce housekeeping flags Before we implement isolcpus under housekeeping, we need the isolation features to be more finegrained. For example some people want NOHZ_FULL without the full scheduler isolation, others want full scheduler isolation without NOHZ_FULL. So let's cut all these isolation features piecewise, at the risk of overcutting it right now. We can still merge some flags later if they always make sense together. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- drivers/net/ethernet/tile/tilegx.c | 4 ++-- include/linux/sched/isolation.h | 26 +++++++++++++++++--------- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- kernel/sched/core.c | 8 ++++---- kernel/sched/fair.c | 2 +- kernel/sched/isolation.c | 26 +++++++++++++++----------- kernel/watchdog.c | 3 ++- 8 files changed, 43 insertions(+), 30 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 27a3272dcd48..b3e5816a4678 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void) tile_net_dev_init(name, mac); if (!network_cpus_init()) - cpumask_and(&network_cpus_map, housekeeping_cpumask(), - cpu_online_mask); + cpumask_and(&network_cpus_map, + housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask); return 0; } diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 93ac2367a520..9bb753eece3b 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,35 +5,43 @@ #include #include +enum hk_flags { + HK_FLAG_TIMER = 1, + HK_FLAG_RCU = (1 << 1), + HK_FLAG_MISC = (1 << 2), + HK_FLAG_SCHED = (1 << 3), +}; + #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); -extern int housekeeping_any_cpu(void); -extern const struct cpumask *housekeeping_cpumask(void); -extern void housekeeping_affine(struct task_struct *t); -extern bool housekeeping_test_cpu(int cpu); +extern int housekeeping_any_cpu(enum hk_flags flags); +extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); +extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); +extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); extern void __init housekeeping_init(void); #else -static inline int housekeeping_any_cpu(void) +static inline int housekeeping_any_cpu(enum hk_flags flags) { return smp_processor_id(); } -static inline const struct cpumask *housekeeping_cpumask(void) +static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { return cpu_possible_mask; } -static inline void housekeeping_affine(struct task_struct *t) { } +static inline void housekeeping_affine(struct task_struct *t, + enum hk_flags 
flags) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ -static inline bool housekeeping_cpu(int cpu) +static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_test_cpu(cpu); + return housekeeping_test_cpu(cpu, flags); #endif return true; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c7632f51c5a0..34125d23e58a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 79abeb0ca329..e3e60efaafee 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) LIST_HEAD(rcu_tasks_holdouts); /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current); + housekeeping_affine(current, HK_FLAG_RCU); /* * Each pass through the following loop makes one check for diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d0fb448dd43a..2210c0203e51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -527,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -536,15 +536,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cdece8f967f0..f755de86a813 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu)) + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index bb8ba19a0235..37a138a780ff 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -15,37 +15,39 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); EXPORT_SYMBOL_GPL(housekeeping_overriden); static cpumask_var_t housekeeping_mask; +static unsigned int housekeeping_flags; -int housekeeping_any_cpu(void) +int housekeeping_any_cpu(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_any_and(housekeeping_mask, cpu_online_mask); - + if (housekeeping_flags & flags) + return cpumask_any_and(housekeeping_mask, cpu_online_mask); return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(void) +const struct cpumask *housekeeping_cpumask(enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return housekeeping_mask; - + if (housekeeping_flags & flags) + return housekeeping_mask; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct 
task_struct *t) +void housekeeping_affine(struct task_struct *t, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping_flags & flags) + set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu) +bool housekeeping_test_cpu(int cpu, enum hk_flags flags) { if (static_branch_unlikely(&housekeeping_overriden)) - return cpumask_test_cpu(cpu, housekeeping_mask); - + if (housekeeping_flags & flags) + return cpumask_test_cpu(cpu, housekeeping_mask); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); @@ -65,6 +67,8 @@ void __init housekeeping_init(void) cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask); + housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + static_branch_enable(&housekeeping_overriden); /* We need at least one CPU to handle housekeeping work */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 562652c9c815..e9e2ebb3db29 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -777,7 +777,8 @@ void __init lockup_detector_init(void) if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_copy(&watchdog_cpumask, housekeeping_cpumask()); + cpumask_copy(&watchdog_cpumask, + housekeeping_cpumask(HK_FLAG_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; -- cgit v1.2.3 From fd30b717b86dc30ffe25596f8de6542a02ae9401 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 17:58:54 -0700 Subject: rcu: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Kees Cook --- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree_plugin.h | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45f2ffbc1e78..96a3cdaeed91 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1076,7 +1076,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * counter in the element should never be greater than 1, otherwise, the * RCU implementation is broken. */ -static void rcu_torture_timer(unsigned long unused) +static void rcu_torture_timer(struct timer_list *unused) { int idx; unsigned long started; @@ -1163,7 +1163,7 @@ rcu_torture_reader(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); + timer_setup_on_stack(&t, rcu_torture_timer, 0); do { if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..e85946d9843b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2261,9 +2261,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. 
*/ -static void do_nocb_deferred_wakeup_timer(unsigned long x) +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { - do_nocb_deferred_wakeup_common((struct rcu_data *)x); + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + do_nocb_deferred_wakeup_common(rdp); } /* @@ -2327,8 +2329,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_lock_init(&rdp->nocb_lock); - setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, - (unsigned long)rdp); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); } /* -- cgit v1.2.3 From b04db8e19fc2e9131524dec43057c1b96d5ba3ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Nov 2017 16:01:30 +0100 Subject: rcu: Use lockdep to assert IRQs are disabled/enabled Lockdep now has an integrated IRQs disabled/enabled sanity check. Just use it instead of the ad-hoc RCU version. Signed-off-by: Frederic Weisbecker Acked-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: David S . Miller Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/r/1509980490-4285-15-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/rcu/tree.c | 16 ++++++++-------- kernel/rcu/tree_plugin.h | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/rcu/tree_plugin.h') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e3650e94ae6..08fa586d7f8c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); return READ_ONCE(*fp); } @@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -840,7 +840,7 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(false); } @@ -855,7 +855,7 @@ void rcu_idle_enter(void) */ void rcu_user_enter(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -880,7 +880,7 @@ void rcu_irq_exit(void) { struct rcu_dynticks *rdtp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... 
*/ @@ -947,7 +947,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -1018,7 +1018,7 @@ void rcu_irq_enter(void) struct rcu_dynticks *rdtp; long long oldval; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e012b9be777e..df08e5c8126b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -325,7 +325,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) @@ -2012,7 +2012,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); + lockdep_assert_irqs_disabled(); if (!rcu_is_nocb_cpu(smp_processor_id())) return false; /* Not NOCBs CPU, caller must migrate CBs. */ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), -- cgit v1.2.3
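
The HK_FLAG_* values introduced above are passed by each caller to name the kind of work it is about to place, as get_nohz_timer_target() does with HK_FLAG_TIMER. As a rough sketch of that calling convention (not taken from the patches; my_pick_work_cpu() is a hypothetical helper and its caller is assumed to run with preemption disabled), selecting a target CPU for deferrable "misc" work might look like this:

#include <linux/sched/isolation.h>
#include <linux/smp.h>

/* Hypothetical helper: pick a CPU for deferrable "misc" work. */
static int my_pick_work_cpu(void)
{
	int cpu = smp_processor_id();	/* caller runs with preemption disabled */

	/* Prefer the local CPU if it still takes housekeeping work... */
	if (housekeeping_cpu(cpu, HK_FLAG_MISC))
		return cpu;

	/* ...otherwise fall back to any online housekeeping CPU. */
	return housekeeping_any_cpu(HK_FLAG_MISC);
}

Note that housekeeping_init() above sets housekeeping_flags to HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC, so a query against a flag outside that set (HK_FLAG_SCHED, for instance) falls back to treating every CPU as a housekeeping CPU.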
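
Similarly, housekeeping_affine() is used above to keep RCU's grace-period and tasks kthreads on the housekeeping set, and any other kthread with the same requirement could do the same. A minimal sketch, with my_worker_fn() as a made-up kthread function:

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>

static int my_worker_fn(void *unused)
{
	/* Keep this worker off nohz_full CPUs; the sysadmin can still move it. */
	housekeeping_affine(current, HK_FLAG_MISC);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}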
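
The timer_setup() conversion follows the generic pattern of embedding the struct timer_list in a per-object structure and recovering that structure in the callback with from_timer(), rather than smuggling the object through an unsigned long cookie. A minimal sketch of the same pattern outside RCU, with struct my_data, my_timer_fn() and my_data_arm() as made-up names:

#include <linux/timer.h>
#include <linux/jiffies.h>

/* Hypothetical object embedding its timer, as rcu_data embeds ->nocb_timer. */
struct my_data {
	struct timer_list timer;
	int pending;
};

/*
 * New-style callback: it receives the timer_list pointer and uses
 * from_timer() to reach the enclosing structure, replacing the old
 * (unsigned long) cast of an opaque cookie.
 */
static void my_timer_fn(struct timer_list *t)
{
	struct my_data *md = from_timer(md, t, timer);

	md->pending = 0;
}

static void my_data_arm(struct my_data *md)
{
	md->pending = 1;
	/* Replaces setup_timer(&md->timer, my_timer_fn, (unsigned long)md). */
	timer_setup(&md->timer, my_timer_fn, 0);
	mod_timer(&md->timer, jiffies + HZ);
}

For timers living on the stack, as in rcu_torture_reader() above, timer_setup_on_stack() is the matching variant.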
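
The final patch replaces RCU's hand-rolled RCU_LOCKDEP_WARN(!irqs_disabled(), ...) checks with lockdep's generic lockdep_assert_irqs_disabled(). The same assertion suits any function whose contract is "caller must have interrupts disabled"; a rough sketch, with my_stats and my_account() as hypothetical names:

#include <linux/irqflags.h>
#include <linux/lockdep.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_stats);

/* Callers must have interrupts disabled, e.g. via local_irq_save(). */
static void my_account(unsigned long n)
{
	/*
	 * With lockdep's IRQ-state tracking enabled this warns if a caller
	 * left interrupts on; otherwise it compiles away entirely.
	 */
	lockdep_assert_irqs_disabled();

	__this_cpu_add(my_stats, n);
}

The companion lockdep_assert_irqs_enabled() covers the opposite contract, hence the "disabled/enabled" in the subject line.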