-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 52
-rw-r--r--  include/linux/rcupdate.h | 1
-rw-r--r--  include/linux/rcutiny.h | 21
-rw-r--r--  include/linux/rcutree.h | 2
-rw-r--r--  kernel/rcu/Kconfig | 29
-rw-r--r--  kernel/rcu/Kconfig.debug | 2
-rw-r--r--  kernel/rcu/rcu.h | 15
-rw-r--r--  kernel/rcu/rcuscale.c | 1
-rw-r--r--  kernel/rcu/rcutorture.c | 247
-rw-r--r--  kernel/rcu/refscale.c | 18
-rw-r--r--  kernel/rcu/srcutree.c | 98
-rw-r--r--  kernel/rcu/tiny.c | 25
-rw-r--r--  kernel/rcu/tree.c | 184
-rw-r--r--  kernel/rcu/tree.h | 13
-rw-r--r--  kernel/rcu/tree_exp.h | 113
-rw-r--r--  kernel/rcu/tree_nocb.h | 266
-rw-r--r--  kernel/rcu/tree_plugin.h | 46
-rw-r--r--  kernel/rcu/update.c | 13
-rw-r--r--  kernel/smp.c | 4
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh | 11
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 1
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm.sh | 6
22 files changed, 870 insertions(+), 298 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..4cd3ca5d09a8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3659,6 +3659,9 @@
just as if they had also been called out in the
rcu_nocbs= boot parameter.
+ Note that this argument takes precedence over
+ the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
+
noiotrap [SH] Disables trapped I/O port accesses.
noirqdebug [X86-32] Disables the code which attempts to detect and
@@ -4557,6 +4560,9 @@
no-callback mode from boot but the mode may be
toggled at runtime via cpusets.
+ Note that this argument takes precedence over
+ the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
+
rcu_nocb_poll [KNL]
Rather than requiring that offloaded CPUs
(specified by rcu_nocbs= above) explicitly
@@ -4666,6 +4672,34 @@
When RCU_NOCB_CPU is set, also adjust the
priority of NOCB callback kthreads.
+ rcutree.rcu_divisor= [KNL]
+ Set the shift-right count to use to compute
+ the callback-invocation batch limit bl from
+ the number of callbacks queued on this CPU.
+ The result will be bounded below by the value of
+ the rcutree.blimit kernel parameter. Every bl
+ callbacks, the softirq handler will exit in
+ order to allow the CPU to do other work.
+
+ Please note that this callback-invocation batch
+ limit applies only to non-offloaded callback
+ invocation. Offloaded callbacks are instead
+ invoked in the context of an rcuoc kthread, which
+ the scheduler will preempt as it does any other task.
+
+ rcutree.nocb_nobypass_lim_per_jiffy= [KNL]
+ On callback-offloaded (rcu_nocbs) CPUs,
+ RCU reduces the lock contention that would
+ otherwise be caused by callback floods through
+ use of the ->nocb_bypass list. However, in the
+ common non-flooded case, RCU queues directly to
+ the main ->cblist in order to avoid the extra
+ overhead of the ->nocb_bypass list and its lock.
+ But if there are too many callbacks queued during
+ a single jiffy, RCU pre-queues the callbacks into
+ the ->nocb_bypass queue. The definition of "too
+ many" is supplied by this kernel boot parameter.
+
rcutree.rcu_nocb_gp_stride= [KNL]
Set the number of NOCB callback kthreads in
each group, which defaults to the square root
@@ -5771,6 +5805,24 @@
expediting. Set to zero to disable automatic
expediting.
+ srcutree.srcu_max_nodelay [KNL]
+ Specifies the number of no-delay instances
+ per jiffy for which the SRCU grace period
+ worker thread will be rescheduled with zero
+ delay. Beyond this limit, the worker thread will
+ be rescheduled with a sleep delay of one jiffy.
+
+ srcutree.srcu_max_nodelay_phase [KNL]
+ Specifies the number of non-sleeping polls of readers
+ allowed per grace-period phase. Beyond this limit,
+ the grace-period worker thread will be rescheduled
+ with a sleep delay of one jiffy between each
+ rescan of the readers for that grace-period phase.
+
+ srcutree.srcu_retry_check_delay [KNL]
+ Specifies the number of microseconds of non-sleeping
+ delay between each non-sleeping poll of readers.
+
srcutree.small_contention_lim [KNL]
Specifies the number of update-side contention
events per jiffy that will be tolerated before
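
To make the rcutree.rcu_divisor arithmetic above concrete, here is a minimal userspace sketch of the shift-and-floor computation it describes (illustrative only; the values 7 for the divisor and 10 for blimit are the documented defaults, and this is not the kernel's rcu_do_batch() code):

#include <stdio.h>

/* Sketch of the callback-invocation batch-limit computation described above. */
static long batch_limit(long n_cbs_queued, long blimit, int rcu_divisor)
{
	long bl = n_cbs_queued >> rcu_divisor;	/* shift right by rcutree.rcu_divisor */

	return bl > blimit ? bl : blimit;	/* bounded below by rcutree.blimit */
}

int main(void)
{
	printf("%ld\n", batch_limit(100, 10, 7));	/* 10: the blimit floor applies */
	printf("%ld\n", batch_limit(10000, 10, 7));	/* 78: 10000 >> 7 */
	return 0;
}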
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ebdfeead44e5..937a58b3266b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,6 +41,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);
+unsigned long get_completed_synchronize_rcu(void);
#ifdef CONFIG_PREEMPT_RCU
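
The newly exported get_completed_synchronize_rcu() returns a cookie that always tests as if a grace period had already elapsed, which is useful for initializing cookie fields before any grace period has been requested. A minimal caller-side sketch, with struct and function names that are illustrative assumptions:

#include <linux/rcupdate.h>

/* Illustrative only: foo_cache and its users are assumptions. */
struct foo_cache {
	unsigned long gp_cookie;
};

static void foo_cache_init(struct foo_cache *fc)
{
	/* Start out "already expired" so the first poll below succeeds. */
	fc->gp_cookie = get_completed_synchronize_rcu();
}

static bool foo_cache_reusable(struct foo_cache *fc)
{
	/* True once a full grace period has elapsed since the last snapshot,
	 * and true immediately after foo_cache_init(). */
	return poll_state_synchronize_rcu(fc->gp_cookie);
}

static void foo_cache_retire(struct foo_cache *fc)
{
	/* Record that a new grace period must elapse before reuse. */
	fc->gp_cookie = get_state_synchronize_rcu();
}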
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5fed476f977f..e6bb31a0927b 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
might_sleep();
}
+static inline unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ return start_poll_synchronize_rcu();
+}
+
+static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ cond_synchronize_rcu(oldstate);
+}
+
extern void rcu_barrier(void);
static inline void synchronize_rcu_expedited(void)
@@ -38,7 +48,7 @@ static inline void synchronize_rcu_expedited(void)
*/
extern void kvfree(const void *addr);
-static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
if (head) {
call_rcu(head, func);
@@ -51,6 +61,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
kvfree((void *) func);
}
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ __kvfree_call_rcu(head, func);
+}
+#endif
+
void rcu_qs(void);
static inline void rcu_softirq_qs(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9c6cfb742504..20dbaa9a3882 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu);
void rcu_momentary_dyntick_idle(void);
void kfree_rcu_scheduler_running(void);
bool rcu_gp_might_be_stalled(void);
+unsigned long start_poll_synchronize_rcu_expedited(void);
+void cond_synchronize_rcu_expedited(unsigned long oldstate);
unsigned long get_state_synchronize_rcu(void);
unsigned long start_poll_synchronize_rcu(void);
bool poll_state_synchronize_rcu(unsigned long oldstate);
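
These two declarations add expedited counterparts to the polled grace-period API. A hedged caller-side sketch of how they pair up (only the RCU calls are real; the surrounding names are assumptions):

#include <linux/rcupdate.h>

/* Caller-side sketch; widget_* names are illustrative. */
static unsigned long widget_gp_cookie;

static void widget_retire_fast(void)
{
	/* Snapshot the state and, if needed, kick off an expedited GP now. */
	widget_gp_cookie = start_poll_synchronize_rcu_expedited();
}

static void widget_reuse(void)
{
	/*
	 * Returns immediately if any full grace period has already elapsed
	 * since widget_retire_fast(); otherwise waits for an expedited one.
	 */
	cond_synchronize_rcu_expedited(widget_gp_cookie);
}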
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 1c630e573548..c05ca52cdf64 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -262,6 +262,35 @@ config RCU_NOCB_CPU
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
+
+ Say Y here if you want to offload all CPUs by default on boot.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Invoke offloaded RCU callbacks from real-time kthreads"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
depends on RCU_EXPERT && TASKS_TRACE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 9b64e55d4f61..4da05beb13d7 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -121,7 +121,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
default n
select PREEMPT_COUNT if PREEMPT=n
help
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4916077119f3..32291f4eefde 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -23,6 +23,9 @@
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
extern int sysctl_sched_rt_runtime;
/*
@@ -120,6 +123,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
}
/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+}
+
+/*
* Has a grace period completed since the time the old gp_seq was collected?
*/
static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
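
The following standalone sketch illustrates rcu_seq_done_exact()'s narrow guard band using the definitions above; the helper macros are written to mirror the kernel's ULONG_CMP_GE()/ULONG_CMP_LT(), and the sequence values are arbitrary examples:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

/* Mirrors rcu_seq_done_exact(), but takes the current sequence by value. */
static bool seq_done_exact(unsigned long cur_s, unsigned long s)
{
	return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
}

int main(void)
{
	unsigned long s = 104;	/* snapshot as if from rcu_seq_snap() */

	printf("%d\n", seq_done_exact(100, s));	/* 0: inside the small "not yet done" window */
	printf("%d\n", seq_done_exact(104, s));	/* 1: the counter reached the snapshot */
	printf("%d\n", seq_done_exact(96, s));	/* 1: far enough behind to be treated as wrapped, hence done */
	return 0;
}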
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 277a5bfb37d4..3ef02d4a8108 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -419,6 +419,7 @@ rcu_scale_writer(void *arg)
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
sched_set_fifo_low(current);
if (holdoff)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7120165a9342..d8e1b270a065 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -75,62 +75,47 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
-torture_param(int, read_exit_delay, 13,
- "Delay between read-then-exit episodes (s)");
-torture_param(int, read_exit_burst, 16,
- "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
-torture_param(bool, stall_no_softlockup, false,
- "Avoid softlockup warning during cpu stall.");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
- "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
@@ -209,12 +194,16 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_POLL_GET 7
-#define RTWS_POLL_WAIT 8
-#define RTWS_SYNC 9
-#define RTWS_STUTTER 10
-#define RTWS_STOPPING 11
+#define RTWS_COND_GET_EXP 6
+#define RTWS_COND_SYNC 7
+#define RTWS_COND_SYNC_EXP 8
+#define RTWS_POLL_GET 9
+#define RTWS_POLL_GET_EXP 10
+#define RTWS_POLL_WAIT 11
+#define RTWS_POLL_WAIT_EXP 12
+#define RTWS_SYNC 13
+#define RTWS_STUTTER 14
+#define RTWS_STOPPING 15
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -222,9 +211,13 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_EXP",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_EXP",
"RTWS_POLL_GET",
+ "RTWS_POLL_GET_EXP",
"RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_EXP",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -337,7 +330,12 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
unsigned long (*get_gp_state)(void);
+ unsigned long (*get_gp_completed)(void);
unsigned long (*start_gp_poll)(void);
bool (*poll_gp_state)(unsigned long oldstate);
void (*cond_sync)(unsigned long oldstate);
@@ -504,9 +502,14 @@ static struct rcu_torture_ops rcu_ops = {
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
.get_gp_state = get_state_synchronize_rcu,
+ .get_gp_completed = get_completed_synchronize_rcu,
.start_gp_poll = start_poll_synchronize_rcu,
.poll_gp_state = poll_state_synchronize_rcu,
.cond_sync = cond_synchronize_rcu,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@@ -1136,9 +1139,8 @@ rcu_torture_fqs(void *arg)
return 0;
}
-// Used by writers to randomly choose from the available grace-period
-// primitives. The only purpose of the initialization is to size the array.
-static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC };
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
static int nsynctypes;
/*
@@ -1146,18 +1148,27 @@ static int nsynctypes;
*/
static void rcu_torture_write_types(void)
{
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_poll1 = gp_poll, gp_sync1 = gp_sync;
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
+ bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true;
+ if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp1 &&
+ !gp_normal1 && !gp_poll1 && !gp_sync1)
+ gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
+ gp_normal1 = gp_poll1 = gp_sync1 = true;
if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
} else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1176,6 +1187,12 @@ static void rcu_torture_write_types(void)
} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
pr_alert("%s: gp_poll without primitives.\n", __func__);
}
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1254,6 +1271,10 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_completed) {
+ cookie = cur_ops->get_gp_completed();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
cur_ops->readunlock(idx);
}
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1263,7 +1284,12 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
cur_ops->exp_sync();
+ cur_ops->exp_sync();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
@@ -1274,6 +1300,14 @@ rcu_torture_writer(void *arg)
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
gp_snap = cur_ops->start_gp_poll();
@@ -1283,9 +1317,23 @@ rcu_torture_writer(void *arg)
&rand);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
cur_ops->sync();
+ cur_ops->sync();
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1321,8 +1369,9 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
- rcu_ftrace_dump(DUMP_ALL);
+ tracing_off();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1384,6 +1433,11 @@ rcu_torture_fakewriter(void *arg)
torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
cur_ops->cond_sync(gp_snap);
break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
case RTWS_POLL_GET:
gp_snap = cur_ops->start_gp_poll();
while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1391,6 +1445,13 @@ rcu_torture_fakewriter(void *arg)
&rand);
}
break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
case RTWS_SYNC:
cur_ops->sync();
break;
@@ -1868,7 +1929,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@@ -1990,7 +2051,13 @@ static void rcu_torture_mem_dump_obj(void)
static int z;
kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
mem_dump_obj(ZERO_SIZE_PTR);
@@ -2007,6 +2074,8 @@ static void rcu_torture_mem_dump_obj(void)
kmem_cache_free(kcp, rhp);
kmem_cache_destroy(kcp);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(kmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -2014,6 +2083,8 @@ static void rcu_torture_mem_dump_obj(void)
mem_dump_obj(&rhp->func);
kfree(rhp);
rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
pr_alert("mem_dump_obj(vmalloc %px):", rhp);
mem_dump_obj(rhp);
@@ -2075,6 +2146,19 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+ // Testing RCU priority boosting requires that rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
@@ -2873,7 +2957,6 @@ static int rcu_torture_read_exit_child(void *trsp_in)
// Parent kthread which creates and destroys read-exit child kthreads.
static int rcu_torture_read_exit(void *unused)
{
- int count = 0;
bool errexit = false;
int i;
struct task_struct *tsp;
@@ -2885,34 +2968,28 @@ static int rcu_torture_read_exit(void *unused)
// Each pass through this loop does one read-exit episode.
do {
- if (++count > read_exit_burst) {
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
- rcu_barrier(); // Wait for task_struct free, avoid OOM.
- for (i = 0; i < read_exit_delay; i++) {
- schedule_timeout_uninterruptible(HZ);
- if (READ_ONCE(read_exit_child_stop))
- break;
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
}
- if (!READ_ONCE(read_exit_child_stop))
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
- count = 0;
- }
- if (READ_ONCE(read_exit_child_stop))
- break;
- // Spawn child.
- tsp = kthread_run(rcu_torture_read_exit_child,
- &trs, "%s",
- "rcu_torture_read_exit_child");
- if (IS_ERR(tsp)) {
- TOROUT_ERRSTRING("out of memory");
- errexit = true;
- tsp = NULL;
- break;
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
}
- cond_resched();
- kthread_stop(tsp);
- n_read_exits ++;
- stutter_wait("rcu_torture_read_exit");
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
} while (!errexit && !READ_ONCE(read_exit_child_stop));
// Clean up and exit.
@@ -3122,6 +3199,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
+ kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -3329,21 +3407,6 @@ rcu_torture_init(void)
rcutor_hp = firsterr;
if (torture_init_error(firsterr))
goto unwind;
-
- // Testing RCU priority boosting requires rcutorture do
- // some serious abuse. Counter this by running ksoftirqd
- // at higher priority.
- if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
- for_each_online_cpu(cpu) {
- struct sched_param sp;
- struct task_struct *t;
-
- t = per_cpu(ksoftirqd, cpu);
- WARN_ON_ONCE(!t);
- sp.sched_priority = 2;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- }
- }
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 909644abee67..435c884c02b5 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = {
};
// Definitions for global spinlock
-static DEFINE_SPINLOCK(test_lock);
+static DEFINE_RAW_SPINLOCK(test_lock);
static void ref_lock_section(const int nloops)
{
@@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
- spin_unlock(&test_lock);
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock(&test_lock);
+ raw_spin_lock(&test_lock);
un_delay(udl, ndl);
- spin_unlock(&test_lock);
+ raw_spin_unlock(&test_lock);
}
preempt_enable();
}
@@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops)
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
@@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_disable();
for (i = nloops; i >= 0; i--) {
- spin_lock_irqsave(&test_lock, flags);
+ raw_spin_lock_irqsave(&test_lock, flags);
un_delay(udl, ndl);
- spin_unlock_irqrestore(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
}
preempt_enable();
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 50ba70f019de..1c304fec89c0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
return sum;
}
-#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
-#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
-#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances.
-#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances.
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// Per-GP-phase no-delay instances adjusted to allow non-sleeping polls for up
+// to one jiffy's duration. The multiplication by 2 factors in the srcu_get_delay()
+// call made from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
/*
* Return grace-period delay, zero if there are expedited grace
@@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
+ unsigned long gpstart;
+ unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
jbase = 0;
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
- jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
- if (!jbase) {
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
- jbase = 1;
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(ssp->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
}
return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
@@ -607,15 +655,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-
-/*
* Start an SRCU grace period.
*/
static void srcu_gp_start(struct srcu_struct *ssp)
@@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(ssp);
idx = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = !!srcu_get_delay(ssp);
+ if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}
@@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
j = jiffies;
if (READ_ONCE(ssp->reschedule_jiffies) == j) {
WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+ if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
WRITE_ONCE(ssp->reschedule_count, 1);
@@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
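
A worked example of the SRCU_DEFAULT_MAX_NODELAY_PHASE computation added above, as a userspace sketch (the HZ values are illustrative; the clamp bounds are the _LO/_HI defaults from the patch):

#include <stdio.h>

#define USEC_PER_SEC			1000000UL
#define SRCU_DEFAULT_RETRY_CHECK_DELAY	5
#define PHASE_LO			3UL
#define PHASE_HI			1000UL

static unsigned long max_nodelay_phase(unsigned long hz)
{
	/* Two srcu_get_delay()-driven polls per jiffy, each spinning for
	 * SRCU_DEFAULT_RETRY_CHECK_DELAY microseconds. */
	unsigned long v = 2UL * USEC_PER_SEC / hz / SRCU_DEFAULT_RETRY_CHECK_DELAY;

	return v < PHASE_LO ? PHASE_LO : (v > PHASE_HI ? PHASE_HI : v);
}

int main(void)
{
	printf("HZ=1000: %lu\n", max_nodelay_phase(1000));	/* 400 */
	printf("HZ=250:  %lu\n", max_nodelay_phase(250));	/* 1000 (clamped from 1600) */
	printf("HZ=100:  %lu\n", max_nodelay_phase(100));	/* 1000 (clamped from 4000) */
	return 0;
}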
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 340b3f8b090d..f0561ee16b9c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -58,7 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
- WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_rcu() is a quiescent
- * state, and so on a UP system, synchronize_rcu() need do nothing.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
+ *
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@@ -152,6 +154,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@@ -213,10 +216,24 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ if (head) {
+ void *ptr = (void *) head - (unsigned long) func;
+
+ kasan_record_aux_stack_noalloc(ptr);
+ }
+
+ __kvfree_call_rcu(head, func);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
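
For context on the pointer arithmetic in the new Tiny RCU KASAN wrapper: the two-argument form of kvfree_rcu() passes the offset of the rcu_head within the enclosing object in place of a callback, so subtracting that offset from the rcu_head address recovers the allocation for kasan_record_aux_stack_noalloc(). A hedged sketch, with struct and function names that are illustrative:

#include <linux/rcupdate.h>

/* Illustrative only: struct foo and foo_release() are assumptions. */
struct foo {
	int data;
	struct rcu_head rh;
};

static void foo_release(struct foo *fp)
{
	/*
	 * Expands (roughly) to kvfree_call_rcu(&fp->rh,
	 * (rcu_callback_t)offsetof(struct foo, rh)), so in the wrapper
	 * above, (void *)head - (unsigned long)func == fp.
	 */
	kvfree_rcu(fp, rh);
}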
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c25ba442044a..62e514713f7a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
-/* rcuc/rcub/rcuop kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. Whether the "rcuop"
+ * kthreads actually run at real-time priority is additionally
+ * controlled by the CONFIG_RCU_NOCB_CPU_CB_BOOST Kconfig option.
+ */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -1775,6 +1779,79 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked())
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
+
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked())
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irq_rcu_node(rnp);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irq_rcu_node(rnp);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irq_rcu_node(rnp);
+}
+
/*
* Initialize a new grace period. Return false if no grace period required.
*/
@@ -1810,6 +1887,7 @@ static noinline_for_stack bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -1971,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time)
*/
static noinline_for_stack void rcu_gp_fqs_loop(void)
{
- bool first_gp_fqs;
+ bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
- first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
- if (!ret) {
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
/*
* jiffies_force_qs before RCU_GP_WAIT_FQS state
@@ -2001,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
+ /*
+ * Exit the loop if the root rcu_node structure indicates that the grace period
+ * has ended. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
@@ -2069,6 +2159,7 @@ static noinline void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -2530,7 +2621,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
@@ -2608,7 +2699,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
- is_idle_task(current), rcu_is_callbacks_kthread());
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
@@ -3211,7 +3302,6 @@ struct kfree_rcu_cpu_work {
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
* @count: Number of objects for which GP not started
* @bkvcache:
@@ -3236,7 +3326,6 @@ struct kfree_rcu_cpu {
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
- bool monitor_todo;
bool initialized;
int count;
@@ -3416,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work)
}
}
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (krcp->bkvhead[i])
+ return true;
+
+ return !!krcp->head;
+}
+
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
@@ -3472,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
// of the channels that is still busy we should rearm the
// work to repeat an attempt. Because previous batches are
// still in progress.
- if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
- krcp->monitor_todo = false;
- else
+ if (need_offload_krc(krcp))
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
@@ -3662,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
WRITE_ONCE(krcp->count, krcp->count + 1);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !krcp->monitor_todo) {
- krcp->monitor_todo = true;
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- }
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -3741,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void)
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
- krcp->monitor_todo) {
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- continue;
- }
- krcp->monitor_todo = true;
- schedule_delayed_work_on(cpu, &krcp->monitor_work,
- KFREE_DRAIN_JIFFIES);
+ if (need_offload_krc(krcp))
+ schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3837,8 +3927,18 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with
+ // other normal GPs only by being fully nested within them,
+ // which allows reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+ if (rcu_init_invoked())
+ cond_resched_tasks_rcu_qs();
return; // Context allows vacuous grace periods.
+ }
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
@@ -3860,7 +3960,7 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
- return rcu_seq_snap(&rcu_state.gp_seq);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
@@ -3889,7 +3989,13 @@ unsigned long start_poll_synchronize_rcu(void)
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); // irqs already disabled.
- needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap(). This is OK: the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake();
@@ -3911,7 +4017,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*
* Yes, this function does not take counter wrap into account.
* But counter wrap is harmless. If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * more than a billion grace periods (and way more on a 64-bit system!).
* Those needing to keep oldstate values for very long time periods
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
@@ -3924,7 +4030,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
- if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
smp_mb(); /* Ensure GP ends before subsequent accesses. */
return true;
}
@@ -3935,20 +4042,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
* Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
*
* This function provides the same memory-ordering guarantees that
* would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
* of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
@@ -4441,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu)
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
@@ -4486,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu)
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4701,6 +4810,9 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
mutex_init(&rnp->boost_kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@@ -4926,6 +5038,10 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
+
+ // Kick-start any polled grace periods that started early.
+ if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
+ (void)start_poll_synchronize_rcu_expedited();
}
#include "tree_stall.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2ccf5845957d..3cdc18997a38 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -133,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@@ -235,6 +239,7 @@ struct rcu_data {
* if rdp_gp.
*/
struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -323,6 +328,9 @@ struct rcu_state {
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@@ -425,7 +433,7 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
@@ -481,3 +489,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0f70f62039a9..f092c7f18a5f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@@ -621,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
- panic_on_rcu_stall();
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
@@ -636,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c", cpu,
+ pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
@@ -669,6 +671,7 @@ static void synchronize_rcu_expedited_wait(void)
}
}
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+ panic_on_rcu_stall();
}
}
@@ -913,8 +916,18 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
/* Is the state is such that the call is a grace period? */
- if (rcu_blocking_is_gp())
- return;
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ if (rcu_init_invoked())
+ cond_resched();
+ return; // Context allows vacuous grace periods.
+ }
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
@@ -950,3 +963,93 @@ void synchronize_rcu_expedited(void)
synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ rnp->exp_seq_poll_rq = s;
+ if (rcu_init_invoked())
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
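
A minimal usage sketch of the polled expedited grace-period API added above, assuming a caller that stores the cookie in a static variable; my_cookie, my_retire(), my_can_reclaim(), and my_cleanup() are illustrative names, while start_poll_synchronize_rcu_expedited(), poll_state_synchronize_rcu(), and cond_synchronize_rcu_expedited() are the APIs shown in this patch:

	#include <linux/rcupdate.h>

	static unsigned long my_cookie;	/* illustrative cookie storage */

	/* Snapshot RCU state; start an expedited GP if one is needed. */
	static void my_retire(void)
	{
		my_cookie = start_poll_synchronize_rcu_expedited();
	}

	/* Later, either check whether a full grace period has elapsed... */
	static bool my_can_reclaim(void)
	{
		return poll_state_synchronize_rcu(my_cookie);
	}

	/* ...or wait (expedited) for one if it has not. */
	static void my_cleanup(void)
	{
		cond_synchronize_rcu_expedited(my_cookie);
	}
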
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 46694e13398a..a8f574d8850d 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -546,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
}
}
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+ bool *wake_state)
{
struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Offloading. Set our flag and notify the offload worker.
+ * We will handle this rdp until it is de-offloaded.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 1;
+ } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it is re-offloaded.
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 0;
+ } else {
+ WARN_ON_ONCE(1);
+ ret = -1;
}
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return ret;
}
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
@@ -609,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
bool needwake_gp;
- struct rcu_data *rdp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
struct rcu_node *rnp;
unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
bool wasempty = false;
@@ -634,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
* is added to the list, so the skipped-over rcu_data structures
* won't be ignored for long.
*/
- list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
+ lockdep_assert_held(&rdp->nocb_lock);
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -656,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
@@ -705,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (needwake_gp)
rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
@@ -723,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
/* Polling, so trace if first poll in the series. */
if (gotcbs)
trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
} else if (!needwait_gp) {
/* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ nocb_gp_sleep(my_rdp, cpu);
} else {
rnp = my_rdp->mynode;
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
@@ -739,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
!READ_ONCE(my_rdp->nocb_gp_sleep));
trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
+
if (!rcu_nocb_poll) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
del_timer(&my_rdp->nocb_timer);
}
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is reset
+ * *before* we (re)set SEGCBLIST_KTHREAD_GP, or we could race
+ * with another round of nocb toggling for this rdp. Nocb
+ * locking should already prevent that, but stick to paranoia,
+ * especially in this rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ bool wake_state = false;
+ int ret;
+
+ ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ if (ret == 1)
+ list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+ else if (ret == 0)
+ list_del(&rdp_toggling->nocb_entry_rdp);
+ if (wake_state)
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
+
my_rdp->nocb_gp_seq = -1;
WARN_ON(signal_pending(current));
}
@@ -966,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
swake_up_one(&rdp->nocb_cb_wq);
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for addition to or removal from the list that rcuog iterates over
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
if (rdp_gp->nocb_gp_sleep) {
rdp_gp->nocb_gp_sleep = false;
wake_gp = true;
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
+ return wake_gp;
}
static long rcu_nocb_rdp_deoffload(void *arg)
@@ -983,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * rcu_nocb_rdp_deoffload() may be called directly if the
+ * rcuog/o[p] kthread spawn failed, because at this point
+ * rdp->cpu is not yet online.
+ */
+ WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
pr_info("De-offloading %d\n", rdp->cpu);
@@ -1009,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg)
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
invoke_rcu_core();
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- /* Stop nocb_gp_wait() from iterating over this structure. */
- list_del_rcu(&rdp->nocb_entry_rdp);
+ wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ /*
+ * If the rcuo[p] kthread spawn failed, directly clear SEGCBLIST_KTHREAD_CB
+ * and just wait for SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+ */
+ if (!rdp->nocb_cb_kthread) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ } else {
+ /*
+ * No kthread is available to clear the flags for us or to remove the rdp
+ * from the nocb iteration list, so do it here instead. Locking doesn't look
+ * strictly necessary, but stick to paranoia in this rare path.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
/*
* Lock one last time to acquire latest callback updates from kthreads
* so we can later handle callbacks locally without locking.
@@ -1035,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return ret;
+ return 0;
}
int rcu_nocb_cpu_deoffload(int cpu)
@@ -1043,8 +1103,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
- mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
@@ -1055,8 +1115,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
ret = -EINVAL;
}
}
- cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
return ret;
}
@@ -1067,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg)
struct rcu_data *rdp = arg;
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
/*
@@ -1077,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- pr_info("Offloading %d\n", rdp->cpu);
+ if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ return -EINVAL;
- /*
- * Cause future nocb_gp_wait() invocations to iterate over
- * structure, resetting ->nocb_gp_sleep and waking up the related
- * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock
- * before setting ->nocb_gp_sleep again, we are guaranteed to
- * iterate this newly added structure before "rcuog" goes to
- * sleep again.
- */
- list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+ pr_info("Offloading %d\n", rdp->cpu);
/*
* Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
@@ -1111,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg)
* WRITE flags READ callbacks
* rcu_nocb_unlock() rcu_nocb_unlock()
*/
- ret = rdp_offload_toggle(rdp, true, flags);
+ wake_gp = rdp_offload_toggle(rdp, true, flags);
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
@@ -1124,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg)
rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
rcu_nocb_unlock_irqrestore(rdp, flags);
- return ret;
+ return 0;
}
int rcu_nocb_cpu_offload(int cpu)
@@ -1132,8 +1188,8 @@ int rcu_nocb_cpu_offload(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
- mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
if (!rcu_rdp_is_offloaded(rdp)) {
if (cpu_online(cpu)) {
ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
@@ -1144,8 +1200,8 @@ int rcu_nocb_cpu_offload(int cpu)
ret = -EINVAL;
}
}
- cpus_read_unlock();
mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
return ret;
}
@@ -1155,11 +1211,21 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
+ bool offload_all = false;
struct rcu_data *rdp;
+#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
+ if (!rcu_state.nocb_is_setup) {
+ need_rcu_nocb_mask = true;
+ offload_all = true;
+ }
+#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */
+
#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
need_rcu_nocb_mask = true;
+ offload_all = false; /* NO_HZ_FULL has its own mask. */
+ }
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (need_rcu_nocb_mask) {
@@ -1180,6 +1246,9 @@ void __init rcu_init_nohz(void)
cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+ if (offload_all)
+ cpumask_setall(rcu_nocb_mask);
+
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
@@ -1246,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
"rcuog/%d", rdp_gp->cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
- return;
+ goto end;
}
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
if (kthread_prio)
@@ -1258,12 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
+ goto end;
- if (kthread_prio)
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+end:
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
}
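
As a hedged sketch of the offload-toggling entry points reworked above (the caller and CPU number are illustrative; rcu_nocb_cpu_deoffload() and rcu_nocb_cpu_offload() are the functions shown in this patch, each returning 0 on success or a negative errno such as -EINVAL):

	/* Illustrative only: flip CPU 3 out of and back into callback-offloaded mode. */
	static int my_toggle_nocb(void)
	{
		int ret;

		ret = rcu_nocb_cpu_deoffload(3);	/* callbacks handled in-place again */
		if (ret)
			return ret;
		return rcu_nocb_cpu_offload(3);		/* hand callbacks back to rcuo kthreads */
	}
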
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c966d680b789..7ae1551479a2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
-static void
+static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
@@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
@@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
// period for a quiescent state from this CPU. Note that requests from
// tasks are handled when removing the task from the blocked-tasks list
// below.
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
+}
+
#ifdef CONFIG_RCU_BOOST
/*
@@ -1140,7 +1159,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1151,15 +1171,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1242,11 +1253,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fc7fef575606..2e93acad1e31 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+ return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
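
A minimal sketch of the intended use, assuming a hypothetical per-object cookie (struct my_obj and its helpers are illustrative; only get_completed_synchronize_rcu() and poll_state_synchronize_rcu() are kernel APIs). Seeding the cookie with the pre-completed value makes the first poll report completion immediately, so an object that was never exposed to readers needs no grace-period wait:

	struct my_obj {
		unsigned long gp_cookie;	/* polled grace-period cookie */
	};

	static void my_obj_init(struct my_obj *p)
	{
		/* Pre-completed cookie: the first poll below succeeds without waiting. */
		p->gp_cookie = get_completed_synchronize_rcu();
	}

	static bool my_obj_gp_elapsed(struct my_obj *p)
	{
		return poll_state_synchronize_rcu(p->gp_cookie);
	}
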
+
#ifdef CONFIG_PROVE_RCU
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index dd215f439426..650810a6f29b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str)
if (val)
static_branch_enable(&csdlock_debug_enabled);
- return 0;
+ return 1;
}
-early_param("csdlock_debug", csdlock_debug);
+__setup("csdlock_debug=", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
index f17000a2ccf1..ed0ec7f0927e 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
@@ -35,7 +35,7 @@ then
exit 1
fi
-# Remember where we started so that we can get back and the end.
+# Remember where we started so that we can get back at the end.
curcommit="`git status | head -1 | awk '{ print $NF }'`"
nfail=0
@@ -73,15 +73,10 @@ do
# Test the specified commit.
git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
echo git checkout return code: $? "(Commit $ntry: $i)"
- kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
+ kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1
ret=$?
echo kvm.sh return code $ret for commit $i from branch $gitbr
-
- # Move the build products to their resting place.
- runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
- mv $runresdir $resdir/$ds/$idir
- rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
- echo Run results: $resdir/$ds/$idir/$rrd
+ echo Run results: $resdir/$ds/$idir
if test "$ret" -ne 0
then
# Failure, so leave all evidence intact.
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index 0ff59bd8b640..9f0a5d5ff2dd 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log"
# Wait for all remaining scenarios to complete and collect results.
for i in $systems
do
+ echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log"
while checkremotefile "$i" "$resdir/$ds/remote.run"
do
sleep 30
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 263e16aeca0e..6c734818a875 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -164,7 +164,7 @@ do
shift
;;
--gdb)
- TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG
+ TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG
TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG
;;
@@ -180,7 +180,7 @@ do
shift
;;
--kasan)
- TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+ TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
if test -n "$torture_qemu_mem_default"
then
TORTURE_QEMU_MEM=2G
@@ -192,7 +192,7 @@ do
shift
;;
--kcsan)
- TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
+ TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
;;
--kmake-arg|--kmake-args)
checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'