-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 52
-rw-r--r-- | include/linux/rcupdate.h | 1
-rw-r--r-- | include/linux/rcutiny.h | 21
-rw-r--r-- | include/linux/rcutree.h | 2
-rw-r--r-- | kernel/rcu/Kconfig | 29
-rw-r--r-- | kernel/rcu/Kconfig.debug | 2
-rw-r--r-- | kernel/rcu/rcu.h | 15
-rw-r--r-- | kernel/rcu/rcuscale.c | 1
-rw-r--r-- | kernel/rcu/rcutorture.c | 247
-rw-r--r-- | kernel/rcu/refscale.c | 18
-rw-r--r-- | kernel/rcu/srcutree.c | 98
-rw-r--r-- | kernel/rcu/tiny.c | 25
-rw-r--r-- | kernel/rcu/tree.c | 184
-rw-r--r-- | kernel/rcu/tree.h | 13
-rw-r--r-- | kernel/rcu/tree_exp.h | 113
-rw-r--r-- | kernel/rcu/tree_nocb.h | 266
-rw-r--r-- | kernel/rcu/tree_plugin.h | 46
-rw-r--r-- | kernel/rcu/update.c | 13
-rw-r--r-- | kernel/smp.c | 4
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh | 11
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 1
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm.sh | 6
22 files changed, 870 insertions, 298 deletions
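
The centerpiece of this series is the polled expedited grace-period API: start_poll_synchronize_rcu_expedited() snapshots grace-period state (starting an expedited grace period if one is needed), cond_synchronize_rcu_expedited() blocks only if no full grace period has elapsed since that snapshot, and get_completed_synchronize_rcu() supplies a cookie that always reads as already completed. Before the diff itself, here is a minimal caller sketch showing one way these calls might be combined; the demo_obj structure, demo_ptr pointer, demo_lock, and demo_update() are illustrative assumptions only, while the RCU calls are the ones added or declared by this series.

/*
 * Hypothetical caller sketch (not part of this patch): an updater
 * replaces an RCU-protected pointer, snapshots grace-period state via
 * the new polled expedited API, overlaps unrelated work with the grace
 * period, and blocks only if readers might still reference the old
 * version.  demo_obj, demo_ptr, demo_lock, and demo_update() are
 * illustrative names, not anything defined by this series.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_obj {
	int value;
};

static struct demo_obj __rcu *demo_ptr;	/* hypothetical RCU-protected pointer */
static DEFINE_SPINLOCK(demo_lock);	/* serializes hypothetical updaters */

static void demo_update(struct demo_obj *newp)
{
	struct demo_obj *oldp;
	unsigned long cookie;

	spin_lock(&demo_lock);
	oldp = rcu_replace_pointer(demo_ptr, newp, lockdep_is_held(&demo_lock));
	/* Snapshot GP state and, if needed, kick off an expedited GP. */
	cookie = start_poll_synchronize_rcu_expedited();
	spin_unlock(&demo_lock);

	/* ... unrelated work overlaps with the grace period ... */

	/* Block (expedited) only if no full GP has elapsed since the snapshot. */
	cond_synchronize_rcu_expedited(cookie);
	kfree(oldp);
}

A caller that must never block could instead check the same cookie with poll_state_synchronize_rcu() and defer the kfree() to a later pass.
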
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..4cd3ca5d09a8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3659,6 +3659,9 @@ just as if they had also been called out in the rcu_nocbs= boot parameter. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + noiotrap [SH] Disables trapped I/O port accesses. noirqdebug [X86-32] Disables the code which attempts to detect and @@ -4557,6 +4560,9 @@ no-callback mode from boot but the mode may be toggled at runtime via cpusets. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs (specified by rcu_nocbs= above) explicitly @@ -4666,6 +4672,34 @@ When RCU_NOCB_CPU is set, also adjust the priority of NOCB callback kthreads. + rcutree.rcu_divisor= [KNL] + Set the shift-right count to use to compute + the callback-invocation batch limit bl from + the number of callbacks queued on this CPU. + The result will be bounded below by the value of + the rcutree.blimit kernel parameter. Every bl + callbacks, the softirq handler will exit in + order to allow the CPU to do other work. + + Please note that this callback-invocation batch + limit applies only to non-offloaded callback + invocation. Offloaded callbacks are instead + invoked in the context of an rcuoc kthread, which + scheduler will preempt as it does any other task. + + rcutree.nocb_nobypass_lim_per_jiffy= [KNL] + On callback-offloaded (rcu_nocbs) CPUs, + RCU reduces the lock contention that would + otherwise be caused by callback floods through + use of the ->nocb_bypass list. However, in the + common non-flooded case, RCU queues directly to + the main ->cblist in order to avoid the extra + overhead of the ->nocb_bypass list and its lock. + But if there are too many callbacks queued during + a single jiffy, RCU pre-queues the callbacks into + the ->nocb_bypass queue. The definition of "too + many" is supplied by this kernel boot parameter. + rcutree.rcu_nocb_gp_stride= [KNL] Set the number of NOCB callback kthreads in each group, which defaults to the square root @@ -5771,6 +5805,24 @@ expediting. Set to zero to disable automatic expediting. + srcutree.srcu_max_nodelay [KNL] + Specifies the number of no-delay instances + per jiffy for which the SRCU grace period + worker thread will be rescheduled with zero + delay. Beyond this limit, worker thread will + be rescheduled with a sleep delay of one jiffy. + + srcutree.srcu_max_nodelay_phase [KNL] + Specifies the per-grace-period phase, number of + non-sleeping polls of readers. Beyond this limit, + grace period worker thread will be rescheduled + with a sleep delay of one jiffy, between each + rescan of the readers, for a grace period phase. + + srcutree.srcu_retry_check_delay [KNL] + Specifies number of microseconds of non-sleeping + delay between each non-sleeping poll of readers. 
+ srcutree.small_contention_lim [KNL] Specifies the number of update-side contention events per jiffy will be tolerated before diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ebdfeead44e5..937a58b3266b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); +unsigned long get_completed_synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5fed476f977f..e6bb31a0927b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline unsigned long start_poll_synchronize_rcu_expedited(void) +{ + return start_poll_synchronize_rcu(); +} + +static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + cond_synchronize_rcu(oldstate); +} + extern void rcu_barrier(void); static inline void synchronize_rcu_expedited(void) @@ -38,7 +48,7 @@ static inline void synchronize_rcu_expedited(void) */ extern void kvfree(const void *addr); -static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { if (head) { call_rcu(head, func); @@ -51,6 +61,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) kvfree((void *) func); } +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +#else +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + __kvfree_call_rcu(head, func); +} +#endif + void rcu_qs(void); static inline void rcu_softirq_qs(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9c6cfb742504..20dbaa9a3882 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); +unsigned long start_poll_synchronize_rcu_expedited(void); +void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1c630e573548..c05ca52cdf64 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -262,6 +262,35 @@ config RCU_NOCB_CPU Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. + +config RCU_NOCB_CPU_CB_BOOST + bool "Offload RCU callback from real-time kthread" + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help + Use this option to invoke offloaded callbacks as SCHED_FIFO + to avoid starvation by heavy SCHED_OTHER background load. 
+ Of course, running as SCHED_FIFO during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive + tasks either run with higher priority or run on some other CPU. + + Say Y here if you want to set RT priority for offloading kthreads. + Say N here if you are building a !PREEMPT_RT kernel and are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT && TASKS_TRACE_RCU diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b64e55d4f61..4da05beb13d7 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -121,7 +121,7 @@ config RCU_EQS_DEBUG config RCU_STRICT_GRACE_PERIOD bool "Provide debug RCU implementation with short grace periods" - depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU default n select PREEMPT_COUNT if PREEMPT=n help diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4916077119f3..32291f4eefde 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,9 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +/* Low-order bit definition for polled grace-period APIs. */ +#define RCU_GET_STATE_COMPLETED 0x1 + extern int sysctl_sched_rt_runtime; /* @@ -120,6 +123,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) } /* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred, but do not allow the + * (ULONG_MAX / 2) safety-factor/guard-band. + */ +static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) +{ + unsigned long cur_s = READ_ONCE(*sp); + + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); +} + +/* * Has a grace period completed since the time the old gp_seq was collected? */ static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 277a5bfb37d4..3ef02d4a8108 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -419,6 +419,7 @@ rcu_scale_writer(void *arg) VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + current->flags |= PF_NO_SETAFFINITY; sched_set_fifo_low(current); if (holdoff) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..d8e1b270a065 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -75,62 +75,47 @@ MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); -torture_param(int, fqs_duration, 0, - "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); -torture_param(int, fwd_progress_holdoff, 60, - "Time between forward-progress tests (s)"); -torture_param(bool, fwd_progress_need_resched, 1, - "Hide cond_resched() behind need_resched()"); +torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); +torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); -torture_param(bool, gp_normal, false, - "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); +torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); -torture_param(int, n_barrier_cbs, 0, - "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, object_debug, 0, - "Enable debug-object double call_rcu() testing"); +torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); -torture_param(int, read_exit_delay, 13, - "Delay between read-then-exit episodes (s)"); -torture_param(int, read_exit_burst, 16, - "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); -torture_param(int, stall_cpu_holdoff, 10, - "Time to wait before starting stall 
(s)."); -torture_param(bool, stall_no_softlockup, false, - "Avoid softlockup warning during cpu stall."); +torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); -torture_param(int, stall_gp_kthread, 0, - "Grace-period kthread stall duration (s)."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -torture_param(int, test_boost_duration, 4, - "Duration of each boost test, seconds."); -torture_param(int, test_boost_interval, 7, - "Interval between boost tests, seconds."); -torture_param(bool, test_no_idle_hz, true, - "Test support for tickless idle CPUs"); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); @@ -209,12 +194,16 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_POLL_GET 7 -#define RTWS_POLL_WAIT 8 -#define RTWS_SYNC 9 -#define RTWS_STUTTER 10 -#define RTWS_STOPPING 11 +#define RTWS_COND_GET_EXP 6 +#define RTWS_COND_SYNC 7 +#define RTWS_COND_SYNC_EXP 8 +#define RTWS_POLL_GET 9 +#define RTWS_POLL_GET_EXP 10 +#define RTWS_POLL_WAIT 11 +#define RTWS_POLL_WAIT_EXP 12 +#define RTWS_SYNC 13 +#define RTWS_STUTTER 14 +#define RTWS_STOPPING 15 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -222,9 +211,13 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_EXP", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_EXP", "RTWS_POLL_GET", + "RTWS_POLL_GET_EXP", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_EXP", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -337,7 +330,12 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); unsigned long (*get_gp_state)(void); + unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); @@ -504,9 +502,14 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .get_gp_completed = get_completed_synchronize_rcu, .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, + .get_gp_state_exp = 
get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -1136,9 +1139,8 @@ rcu_torture_fqs(void *arg) return 0; } -// Used by writers to randomly choose from the available grace-period -// primitives. The only purpose of the initialization is to size the array. -static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +// Used by writers to randomly choose from the available grace-period primitives. +static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; static int nsynctypes; /* @@ -1146,18 +1148,27 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; + bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && + !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = + gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1176,6 +1187,12 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1254,6 +1271,10 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_completed) { + cookie = cur_ops->get_gp_completed(); + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + } cur_ops->readunlock(idx); } switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1263,7 +1284,12 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); 
cur_ops->exp_sync(); + cur_ops->exp_sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1274,6 +1300,14 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1283,9 +1317,23 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->sync(); + cur_ops->sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; default: @@ -1321,8 +1369,9 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { - rcu_ftrace_dump(DUMP_ALL); + tracing_off(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + rcu_ftrace_dump(DUMP_ALL); } if (stutter_waited) sched_set_normal(current, oldnice); @@ -1384,6 +1433,11 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; + case RTWS_COND_GET_EXP: + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp(gp_snap); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1391,6 +1445,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_EXP: + gp_snap = cur_ops->start_gp_poll_exp(); + while (!cur_ops->poll_gp_state_exp(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; @@ -1868,7 +1929,7 @@ rcu_torture_stats_print(void) batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; } @@ -1990,7 +2051,13 @@ static void rcu_torture_mem_dump_obj(void) static int z; kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + if (WARN_ON_ONCE(!kcp)) + return; rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) { + kmem_cache_destroy(kcp); + return; + } pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); mem_dump_obj(ZERO_SIZE_PTR); @@ -2007,6 +2074,8 @@ static void rcu_torture_mem_dump_obj(void) kmem_cache_free(kcp, rhp); kmem_cache_destroy(kcp); rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) + return; 
pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(kmalloc %px):", rhp); mem_dump_obj(rhp); @@ -2014,6 +2083,8 @@ static void rcu_torture_mem_dump_obj(void) mem_dump_obj(&rhp->func); kfree(rhp); rhp = vmalloc(4096); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(vmalloc %px):", rhp); mem_dump_obj(rhp); @@ -2075,6 +2146,19 @@ static int rcutorture_booster_init(unsigned int cpu) if (boost_tasks[cpu] != NULL) return 0; /* Already created, nothing more to do. */ + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. + if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); @@ -2873,7 +2957,6 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Parent kthread which creates and destroys read-exit child kthreads. static int rcu_torture_read_exit(void *unused) { - int count = 0; bool errexit = false; int i; struct task_struct *tsp; @@ -2885,34 +2968,28 @@ static int rcu_torture_read_exit(void *unused) // Each pass through this loop does one read-exit episode. do { - if (++count > read_exit_burst) { - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); - rcu_barrier(); // Wait for task_struct free, avoid OOM. - for (i = 0; i < read_exit_delay; i++) { - schedule_timeout_uninterruptible(HZ); - if (READ_ONCE(read_exit_child_stop)) - break; + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + for (i = 0; i < read_exit_burst; i++) { + if (READ_ONCE(read_exit_child_stop)) + break; + stutter_wait("rcu_torture_read_exit"); + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + TOROUT_ERRSTRING("out of memory"); + errexit = true; + break; } - if (!READ_ONCE(read_exit_child_stop)) - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); - count = 0; - } - if (READ_ONCE(read_exit_child_stop)) - break; - // Spawn child. - tsp = kthread_run(rcu_torture_read_exit_child, - &trs, "%s", - "rcu_torture_read_exit_child"); - if (IS_ERR(tsp)) { - TOROUT_ERRSTRING("out of memory"); - errexit = true; - tsp = NULL; - break; + cond_resched(); + kthread_stop(tsp); + n_read_exits++; } - cond_resched(); - kthread_stop(tsp); - n_read_exits ++; - stutter_wait("rcu_torture_read_exit"); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + i = 0; + for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++) + schedule_timeout_uninterruptible(HZ); } while (!errexit && !READ_ONCE(read_exit_child_stop)); // Clean up and exit. 
@@ -3122,6 +3199,7 @@ static void rcu_test_debug_objects(void) pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); + kfree(rhp); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -3329,21 +3407,6 @@ rcu_torture_init(void) rcutor_hp = firsterr; if (torture_init_error(firsterr)) goto unwind; - - // Testing RCU priority boosting requires rcutorture do - // some serious abuse. Counter this by running ksoftirqd - // at higher priority. - if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { - for_each_online_cpu(cpu) { - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - } } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 909644abee67..435c884c02b5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = { }; // Definitions for global spinlock -static DEFINE_SPINLOCK(test_lock); +static DEFINE_RAW_SPINLOCK(test_lock); static void ref_lock_section(const int nloops) { @@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); - spin_unlock(&test_lock); + raw_spin_lock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); + raw_spin_lock(&test_lock); un_delay(udl, ndl); - spin_unlock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } @@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); un_delay(udl, ndl); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 50ba70f019de..1c304fec89c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. -#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. -#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. -#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below, boot time configurable) to allow SRCU readers to exit + * their read-side critical sections. 
If there are still some readers + * after one jiffy, we repeatedly block for one jiffy time periods. + * The blocking time is increased as the grace-period age increases, + * with max blocking time capped at 10 jiffies. + */ +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5 + +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY; +module_param(srcu_retry_check_delay, ulong, 0444); + +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. + +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase + // no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase + // no-delay instances. + +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low)) +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high)) +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high)) +// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto +// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay() +// called from process_srcu(). +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \ + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY) + +// Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \ + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI) + +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE; +module_param(srcu_max_nodelay_phase, ulong, 0444); + +// Maximum consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \ + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) + +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; +module_param(srcu_max_nodelay, ulong, 0444); /* * Return grace-period delay, zero if there are expedited grace @@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { + unsigned long gpstart; + unsigned long j; unsigned long jbase = SRCU_INTERVAL; if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) - jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); - if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) - jbase = 1; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + j = jiffies - 1; + gpstart = READ_ONCE(ssp->srcu_gp_start); + if (time_after(j, gpstart)) + jbase += j - gpstart; + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + jbase = 1; + } } return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } @@ -607,15 +655,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. - */ -#define SRCU_RETRY_CHECK_DELAY 5 - -/* * Start an SRCU grace period. 
*/ static void srcu_gp_start(struct srcu_struct *ssp) @@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp */ static void srcu_gp_end(struct srcu_struct *ssp) { - unsigned long cbdelay; + unsigned long cbdelay = 1; bool cbs; bool last_lvl; int cpu; @@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = !!srcu_get_delay(ssp); + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + cbdelay = 0; + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { + unsigned long curdelay; + + curdelay = !srcu_get_delay(ssp); + for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(ssp) <= 0) + if ((--trycount + curdelay) <= 0) return false; - udelay(SRCU_RETRY_CHECK_DELAY); + udelay(srcu_retry_check_delay); } } @@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work) j = jiffies; if (READ_ONCE(ssp->reschedule_jiffies) == j) { WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { WRITE_ONCE(ssp->reschedule_count, 1); @@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void) pr_info("Hierarchical SRCU implementation.\n"); if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); return 0; } early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..f0561ee16b9c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -58,7 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } - WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); local_irq_restore(flags); } @@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. + * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) 
* @@ -152,6 +154,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -213,10 +216,24 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; + return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + if (head) { + void *ptr = (void *) head - (unsigned long) func; + + kasan_record_aux_stack_noalloc(ptr); + } + + __kvfree_call_rcu(head, func); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..62e514713f7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub/rcuop kthread realtime priority */ +/* + * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" + * real-time priority(enabling/disabling) is controlled by + * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. + */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); @@ -1775,6 +1779,79 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. 
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1810,6 +1887,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1971,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time) */ static noinline_for_stack void rcu_gp_fqs_loop(void) { - bool first_gp_fqs; + bool first_gp_fqs = true; int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); - first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); if (rcu_state.cbovld) gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { - if (!ret) { + if (rcu_state.cbovld) { + j = (j + 2) / 3; + if (j <= 0) + j = 1; + } + if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) { WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); /* * jiffies_force_qs before RCU_GP_WAIT_FQS state @@ -2001,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ + /* + * Exit the loop if the root rcu_node structure indicates that the grace period + * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check + * is required only for single-node rcu_node trees because readers blocking + * the current grace period are queued only on leaf rcu_node structures. + * For multi-node trees, checking the root node's ->qsmask suffices, because a + * given root node's ->qsmask bit is cleared only when all CPUs and tasks from + * the corresponding leaf nodes have passed through their quiescent state. + */ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; @@ -2069,6 +2159,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2530,7 +2621,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + rcu_is_callbacks_kthread(rdp)); return; } @@ -2608,7 +2699,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), - is_idle_task(current), rcu_is_callbacks_kthread()); + is_idle_task(current), rcu_is_callbacks_kthread(rdp)); /* Update counts and requeue any remaining callbacks. 
*/ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); @@ -3211,7 +3302,6 @@ struct kfree_rcu_cpu_work { * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * @bkvcache: @@ -3236,7 +3326,6 @@ struct kfree_rcu_cpu { struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; - bool monitor_todo; bool initialized; int count; @@ -3416,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work) } } +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (krcp->bkvhead[i]) + return true; + + return !!krcp->head; +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3472,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // of the channels that is still busy we should rearm the // work to repeat an attempt. Because previous batches are // still in progress. - if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) - krcp->monitor_todo = false; - else + if (need_offload_krc(krcp)) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); @@ -3662,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !krcp->monitor_todo) { - krcp->monitor_todo = true; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - } unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3741,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || - krcp->monitor_todo) { - raw_spin_unlock_irqrestore(&krcp->lock, flags); - continue; - } - krcp->monitor_todo = true; - schedule_delayed_work_on(cpu, &krcp->monitor_work, - KFREE_DRAIN_JIFFIES); + if (need_offload_krc(krcp)) + schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3837,8 +3927,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with + // other normal GPs only by being fully nested within them, + // which allows reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3960,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. 
*/ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3889,7 +3989,13 @@ unsigned long start_poll_synchronize_rcu(void) rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); // irqs already disabled. - needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. + needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); @@ -3911,7 +4017,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!). + * more than a billion grace periods (and way more on a 64-bit system!). * Those needing to keep oldstate values for very long time periods * (many hours even on 32-bit systems) should check them occasionally * and either refresh them or set a flag indicating that the grace period @@ -3924,7 +4030,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + if (oldstate == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } @@ -3935,20 +4042,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call - * to the function that provided @oldstate, and that returned at the end + * to the function that provided @oldstate and that returned at the end * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) @@ -4441,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu) rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! 
*/ + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } @@ -4486,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); + check_cb_ovld_locked(my_rdp, my_rnp); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); @@ -4701,6 +4810,9 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); mutex_init(&rnp->boost_kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -4926,6 +5038,10 @@ void __init rcu_init(void) qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; else qovld_calc = qovld; + + // Kick-start any polled grace periods that started early. + if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) + (void)start_poll_synchronize_rcu_expedited(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..3cdc18997a38 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -133,6 +133,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -235,6 +239,7 @@ struct rcu_data { * if rdp_gp. */ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; @@ -323,6 +328,9 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ @@ -425,7 +433,7 @@ static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static bool rcu_is_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); @@ -481,3 +489,6 @@ static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. 
*/ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0f70f62039a9..f092c7f18a5f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* @@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } @@ -621,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; - panic_on_rcu_stall(); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -636,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c", cpu, + pr_cont(" %d-%c%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", @@ -669,6 +671,7 @@ static void synchronize_rcu_expedited_wait(void) } } jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + panic_on_rcu_stall(); } } @@ -913,8 +916,18 @@ void synchronize_rcu_expedited(void) "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); /* Is the state is such that the call is a grace period? */ - if (rcu_blocking_is_gp()) - return; + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this expedited GP overlaps + // with other expedited GPs only by being fully nested within + // them, which allows reuse of ->gp_seq_polled_exp_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); + if (rcu_init_invoked()) + cond_resched(); + return; // Context allows vacuous grace periods. + } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { @@ -950,3 +963,93 @@ void synchronize_rcu_expedited(void) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Ensure that start_poll_synchronize_rcu_expedited() has the expedited + * RCU grace periods that it needs. 
+ */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) +{ + unsigned long flags; + int i = 0; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; + + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) + return; + while (!poll_state_synchronize_rcu(s)) { + synchronize_rcu_expedited(); + if (i == 10 || i == 20) + pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled)); + i++; + } + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); +} + +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. + */ +unsigned long start_poll_synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long s; + + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rdp->mynode; + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + rnp->exp_seq_poll_rq = s; + if (rcu_init_invoked()) + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 46694e13398a..a8f574d8850d 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -546,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } -/* - * Check if we ignore this rdp. 
- * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp, + bool *wake_state) { struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Offloading. Set our flag and notify the offload worker. + * We will handle this rdp until it ever gets de-offloaded. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 1; + } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 0; + } else { + WARN_ON_ONCE(1); + ret = -1; } - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; + rcu_nocb_unlock_irqrestore(rdp, flags); + + return ret; } +static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) +{ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +} /* * No-CBs GP kthreads come here to wait for additional callbacks to show up @@ -609,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; - struct rcu_data *rdp; + struct rcu_data *rdp, *rdp_toggling = NULL; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. bool wasempty = false; @@ -634,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * is added to the list, so the skipped-over rcu_data structures * won't be ignored for long. 
 	 */
-	list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
-		bool needwake_state = false;
-
-		if (!nocb_gp_enabled_cb(rdp))
-			continue;
+	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
 		rcu_nocb_lock_irqsave(rdp, flags);
-		if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
-			rcu_nocb_unlock_irqrestore(rdp, flags);
-			if (needwake_state)
-				swake_up_one(&rdp->nocb_state_wq);
-			continue;
-		}
+		lockdep_assert_held(&rdp->nocb_lock);
 		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 		if (bypass_ncbs &&
 		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -656,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
 			rcu_nocb_unlock_irqrestore(rdp, flags);
-			if (needwake_state)
-				swake_up_one(&rdp->nocb_state_wq);
 			continue; /* No callbacks here, try next. */
 		}
 		if (bypass_ncbs) {
@@ -705,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		}
 		if (needwake_gp)
 			rcu_gp_kthread_wake();
-		if (needwake_state)
-			swake_up_one(&rdp->nocb_state_wq);
 	}
 
 	my_rdp->nocb_gp_bypass = bypass;
@@ -723,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		/* Polling, so trace if first poll in the series. */
 		if (gotcbs)
 			trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
-		schedule_timeout_idle(1);
+		if (list_empty(&my_rdp->nocb_head_rdp)) {
+			raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+			if (!my_rdp->nocb_toggling_rdp)
+				WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+			raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+			/* Wait for any offloading rdp */
+			nocb_gp_sleep(my_rdp, cpu);
+		} else {
+			schedule_timeout_idle(1);
+		}
 	} else if (!needwait_gp) {
 		/* Wait for callbacks to appear. */
-		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
-		swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
-				!READ_ONCE(my_rdp->nocb_gp_sleep));
-		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+		nocb_gp_sleep(my_rdp, cpu);
 	} else {
 		rnp = my_rdp->mynode;
 		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
@@ -739,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 			!READ_ONCE(my_rdp->nocb_gp_sleep));
 		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
 	}
+
 	if (!rcu_nocb_poll) {
 		raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+		// (De-)queue an rdp to/from the group if its nocb state is changing
+		rdp_toggling = my_rdp->nocb_toggling_rdp;
+		if (rdp_toggling)
+			my_rdp->nocb_toggling_rdp = NULL;
+
 		if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
 			WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 			del_timer(&my_rdp->nocb_timer);
 		}
 		WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
 		raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+	} else {
+		rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+		if (rdp_toggling) {
+			/*
+			 * Paranoid locking to make sure nocb_toggling_rdp is
+			 * reset *before* we (re)set SEGCBLIST_KTHREAD_GP, or we
+			 * could race with another round of nocb toggling for this
+			 * rdp. The nocb locking should already prevent that, but
+			 * we stick to paranoia, especially on this rare path.
+			 */
+			raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+			my_rdp->nocb_toggling_rdp = NULL;
+			raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+		}
+	}
+
+	if (rdp_toggling) {
+		bool wake_state = false;
+		int ret;
+
+		ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+		if (ret == 1)
+			list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+		else if (ret == 0)
+			list_del(&rdp_toggling->nocb_entry_rdp);
+		if (wake_state)
+			swake_up_one(&rdp_toggling->nocb_state_wq);
 	}
+
 	my_rdp->nocb_gp_seq = -1;
 	WARN_ON(signal_pending(current));
 }
@@ -966,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
 		swake_up_one(&rdp->nocb_cb_wq);
 
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+	// Queue this rdp for add/del to/from the list to iterate on rcuog
+	WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
 	if (rdp_gp->nocb_gp_sleep) {
 		rdp_gp->nocb_gp_sleep = false;
 		wake_gp = true;
 	}
 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 
-	if (wake_gp)
-		wake_up_process(rdp_gp->nocb_gp_kthread);
-
-	return 0;
+	return wake_gp;
 }
 
 static long rcu_nocb_rdp_deoffload(void *arg)
@@ -983,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	struct rcu_data *rdp = arg;
 	struct rcu_segcblist *cblist = &rdp->cblist;
 	unsigned long flags;
-	int ret;
+	int wake_gp;
+	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
-	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+	/*
+	 * rcu_nocb_rdp_deoffload() may be called directly if
+	 * rcuog/o[p] spawn failed, because at this time the rdp->cpu
+	 * is not online yet.
+	 */
+	WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
 
 	pr_info("De-offloading %d\n", rdp->cpu);
 
@@ -1009,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	 */
 	rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
 	invoke_rcu_core();
-	ret = rdp_offload_toggle(rdp, false, flags);
-	swait_event_exclusive(rdp->nocb_state_wq,
-			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
-							SEGCBLIST_KTHREAD_GP));
-	/* Stop nocb_gp_wait() from iterating over this structure. */
-	list_del_rcu(&rdp->nocb_entry_rdp);
+	wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+	if (rdp_gp->nocb_gp_kthread) {
+		if (wake_gp)
+			wake_up_process(rdp_gp->nocb_gp_kthread);
+
+		/*
+		 * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
+		 * Just wait for SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+		 */
+		if (!rdp->nocb_cb_kthread) {
+			rcu_nocb_lock_irqsave(rdp, flags);
+			rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+			rcu_nocb_unlock_irqrestore(rdp, flags);
+		}
+
+		swait_event_exclusive(rdp->nocb_state_wq,
+					!rcu_segcblist_test_flags(cblist,
+					  SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+	} else {
+		/*
+		 * No kthread to clear the flags for us or to remove the rdp from the
+		 * nocb iteration list. Do it here instead. Locking doesn't look strictly
+		 * necessary, but we stick to paranoia on this rare path.
+		 */
+		rcu_nocb_lock_irqsave(rdp, flags);
+		rcu_segcblist_clear_flags(&rdp->cblist,
+				SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+		rcu_nocb_unlock_irqrestore(rdp, flags);
+
+		list_del(&rdp->nocb_entry_rdp);
+	}
+	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
 	/*
 	 * Lock one last time to acquire latest callback updates from kthreads
 	 * so we can later handle callbacks locally without locking.
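
The (de-)offload handshake in the hunks above is easier to see in isolation: the requesting worker publishes the rdp through ->nocb_toggling_rdp and sleeps on ->nocb_state_wq, while the rcuog kthread later toggles SEGCBLIST_KTHREAD_GP, fixes up its own iteration list, and wakes the waiter. The userspace model below is only an illustrative sketch of that protocol; every name in it (mock_rdp, mock_rcuog(), KTHREAD_GP, and so on) is a simplified stand-in invented for this example, not the kernel's actual data structures or APIs.

/*
 * Minimal pthread model of the nocb toggle handshake (sketch only).
 * Build with: cc -pthread mock_toggle.c
 */
#include <pthread.h>
#include <stdio.h>

#define KTHREAD_GP 0x1			/* Stands in for SEGCBLIST_KTHREAD_GP. */

struct mock_rdp {
	int flags;			/* Stands in for the segcblist flags. */
	pthread_mutex_t lock;		/* Stands in for the nocb locks. */
	pthread_cond_t state_wq;	/* Stands in for nocb_state_wq. */
};

static struct mock_rdp *toggling_rdp;	/* Stands in for rdp_gp->nocb_toggling_rdp. */
static pthread_mutex_t gp_lock = PTHREAD_MUTEX_INITIALIZER;

/* The rcuog-like side: dequeue the pending rdp and toggle its GP flag. */
static void *mock_rcuog(void *arg)
{
	struct mock_rdp *rdp;

	pthread_mutex_lock(&gp_lock);
	rdp = toggling_rdp;		/* Pick up the queued toggle request. */
	toggling_rdp = NULL;
	pthread_mutex_unlock(&gp_lock);

	if (rdp) {
		pthread_mutex_lock(&rdp->lock);
		rdp->flags ^= KTHREAD_GP;	/* Offload sets it, de-offload clears it. */
		pthread_cond_signal(&rdp->state_wq);	/* Models swake_up_one(). */
		pthread_mutex_unlock(&rdp->lock);
	}
	return NULL;
}

/* The worker side: queue the rdp, "wake" rcuog, then wait for the flag change. */
int main(void)
{
	struct mock_rdp rdp = {
		.flags = KTHREAD_GP,	/* Currently offloaded; request de-offload. */
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.state_wq = PTHREAD_COND_INITIALIZER,
	};
	pthread_t gp;

	pthread_mutex_lock(&gp_lock);
	toggling_rdp = &rdp;		/* Models WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp). */
	pthread_mutex_unlock(&gp_lock);

	pthread_create(&gp, NULL, mock_rcuog, NULL);	/* Models waking the rcuog kthread. */

	pthread_mutex_lock(&rdp.lock);	/* Models swait_event_exclusive(). */
	while (rdp.flags & KTHREAD_GP)
		pthread_cond_wait(&rdp.state_wq, &rdp.lock);
	pthread_mutex_unlock(&rdp.lock);

	printf("de-offload acknowledged by mock rcuog\n");
	pthread_join(gp, NULL);
	return 0;
}

The design point this mirrors is that only the rcuog side ever adds or removes entries on its ->nocb_head_rdp list, which is why nocb_gp_wait() can now walk that list with plain list_for_each_entry() rather than list_for_each_entry_rcu().
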
@@ -1035,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return ret; + return 0; } int rcu_nocb_cpu_deoffload(int cpu) @@ -1043,8 +1103,8 @@ int rcu_nocb_cpu_deoffload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); @@ -1055,8 +1115,8 @@ int rcu_nocb_cpu_deoffload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1067,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* @@ -1077,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg) if (!rdp->nocb_gp_rdp) return -EINVAL; - pr_info("Offloading %d\n", rdp->cpu); + if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + return -EINVAL; - /* - * Cause future nocb_gp_wait() invocations to iterate over - * structure, resetting ->nocb_gp_sleep and waking up the related - * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock - * before setting ->nocb_gp_sleep again, we are guaranteed to - * iterate this newly added structure before "rcuog" goes to - * sleep again. - */ - list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); + pr_info("Offloading %d\n", rdp->cpu); /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING @@ -1111,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg) * WRITE flags READ callbacks * rcu_nocb_unlock() rcu_nocb_unlock() */ - ret = rdp_offload_toggle(rdp, true, flags); + wake_gp = rdp_offload_toggle(rdp, true, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); @@ -1124,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); rcu_nocb_unlock_irqrestore(rdp, flags); - return ret; + return 0; } int rcu_nocb_cpu_offload(int cpu) @@ -1132,8 +1188,8 @@ int rcu_nocb_cpu_offload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); @@ -1144,8 +1200,8 @@ int rcu_nocb_cpu_offload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1155,11 +1211,21 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + bool offload_all = false; struct rcu_data *rdp; +#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) + if (!rcu_state.nocb_is_setup) { + need_rcu_nocb_mask = true; + offload_all = true; + } +#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ + #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { need_rcu_nocb_mask = true; + offload_all = false; /* NO_HZ_FULL has its own mask. 
*/ + } #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (need_rcu_nocb_mask) { @@ -1180,6 +1246,9 @@ void __init rcu_init_nohz(void) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); #endif /* #if defined(CONFIG_NO_HZ_FULL) */ + if (offload_all) + cpumask_setall(rcu_nocb_mask); + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, @@ -1246,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - return; + goto end; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1258,12 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; + goto end; - if (kthread_prio) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); + return; +end: + mutex_lock(&rcu_state.barrier_mutex); + if (rcu_rdp_is_offloaded(rdp)) { + rcu_nocb_rdp_deoffload(rdp); + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + mutex_unlock(&rcu_state.barrier_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c966d680b789..7ae1551479a2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * be quite short, for example, in the case of the call from * rcu_read_unlock_special(). */ -static void +static notrace void rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; @@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * is disabled. This function cannot be expected to understand these * nuances, so the caller must handle them. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && @@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) * evaluate safety in terms of interrupt, softirq, and preemption * disabling. */ -static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; @@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * Because there is no preemptible RCU, there can be no deferred quiescent * states. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } @@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) // period for a quiescent state from this CPU. Note that requests from // tasks are handled when removing the task from the blocked-tasks list // below. 
-static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) WRITE_ONCE(rdp->rcuc_activity, jiffies); } +static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return rdp->nocb_cb_kthread == current; +#else + return false; +#endif +} + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) +{ + return rdp->rcu_cpu_kthread_task == current || + rcu_is_callbacks_nocb_kthread(rdp); +} + #ifdef CONFIG_RCU_BOOST /* @@ -1140,7 +1159,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1151,15 +1171,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1242,11 +1253,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fc7fef575606..2e93acad1e31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); +/** + * get_completed_synchronize_rcu - Return a pre-completed polled state cookie + * + * Returns a value that will always be treated by functions like + * poll_state_synchronize_rcu() as a cookie whose grace period has already + * completed. 
+ */ +unsigned long get_completed_synchronize_rcu(void) +{ + return RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu); + #ifdef CONFIG_PROVE_RCU /* diff --git a/kernel/smp.c b/kernel/smp.c index dd215f439426..650810a6f29b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str) if (val) static_branch_enable(&csdlock_debug_enabled); - return 0; + return 1; } -early_param("csdlock_debug", csdlock_debug); +__setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh index f17000a2ccf1..ed0ec7f0927e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -35,7 +35,7 @@ then exit 1 fi -# Remember where we started so that we can get back and the end. +# Remember where we started so that we can get back at the end. curcommit="`git status | head -1 | awk '{ print $NF }'`" nfail=0 @@ -73,15 +73,10 @@ do # Test the specified commit. git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 echo git checkout return code: $? "(Commit $ntry: $i)" - kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1 ret=$? echo kvm.sh return code $ret for commit $i from branch $gitbr - - # Move the build products to their resting place. - runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`" - mv $runresdir $resdir/$ds/$idir - rrd="`echo $runresdir | sed -e 's,^.*/,,'`" - echo Run results: $resdir/$ds/$idir/$rrd + echo Run results: $resdir/$ds/$idir if test "$ret" -ne 0 then # Failure, so leave all evidence intact. diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 0ff59bd8b640..9f0a5d5ff2dd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log" # Wait for all remaining scenarios to complete and collect results. 
for i in $systems do + echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log" while checkremotefile "$i" "$resdir/$ds/remote.run" do sleep 30 diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 263e16aeca0e..6c734818a875 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -164,7 +164,7 @@ do shift ;; --gdb) - TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG + TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG ;; @@ -180,7 +180,7 @@ do shift ;; --kasan) - TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG if test -n "$torture_qemu_mem_default" then TORTURE_QEMU_MEM=2G @@ -192,7 +192,7 @@ do shift ;; --kcsan) - TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' |
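
Taken together, the polled expedited grace-period interfaces added above (start_poll_synchronize_rcu_expedited(), cond_synchronize_rcu_expedited(), and the pre-completed cookie from get_completed_synchronize_rcu()) are most easily understood from a caller's point of view. The sketch below is illustrative only and is not part of this series; struct foo and the foo_*() helpers are hypothetical, and error handling is omitted.

/* Hypothetical caller of the polled expedited grace-period API (sketch only). */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	unsigned long rcu_cookie;	/* Grace-period cookie for deferred freeing. */
	void *payload;
};

/* Object was never exposed to readers: use an already-completed cookie. */
static void foo_init(struct foo *fp)
{
	fp->rcu_cookie = get_completed_synchronize_rcu();
}

/* Unpublish @fp and snapshot RCU state, starting an expedited GP if needed. */
static void foo_retire(struct foo *fp)
{
	fp->rcu_cookie = start_poll_synchronize_rcu_expedited();
}

/* Non-blocking check, for example from a shrinker or other fast path. */
static bool foo_can_free(struct foo *fp)
{
	return poll_state_synchronize_rcu(fp->rcu_cookie);
}

/* Free @fp, waiting (expedited) only if no full grace period has elapsed. */
static void foo_free(struct foo *fp)
{
	cond_synchronize_rcu_expedited(fp->rcu_cookie);
	kfree(fp->payload);
	kfree(fp);
}

Because get_completed_synchronize_rcu() returns a cookie that poll_state_synchronize_rcu() always treats as expired, objects that go straight from foo_init() to foo_free() skip the grace-period wait entirely, while retired objects wait at most one expedited grace period.
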