diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 11:34:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-03 11:34:53 -0700 |
commit | 330e9e46253cbfab178450c976aa90ef0f3ae940 (patch) | |
tree | 4a9875d264a477449a9c9a0ff23937550d1c3234 /kernel/rcu/tree_plugin.h | |
parent | e94693f797400b9f424c69e1da9381e4846762fe (diff) | |
parent | 567b64aaefc4ef9ae3af124ae0b13dc13a6804a8 (diff) |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The sole purpose of these changes is to shrink and simplify the RCU
code base, which has suffered from creeping bloat over the past couple
of years. The end result is a net removal of ~2700 lines of code:
79 files changed, 1496 insertions(+), 4211 deletions(-)
Plus there's a marked reduction in the Kconfig space complexity as
well, here's the number of matches on 'grep RCU' in the .config:
before after
x86-defconfig 17 15
x86-allmodconfig 33 20"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (86 commits)
rcu: Remove RCU CPU stall warnings from Tiny RCU
rcu: Remove event tracing from Tiny RCU
rcu: Move RCU debug Kconfig options to kernel/rcu
rcu: Move RCU non-debug Kconfig options to kernel/rcu
rcu: Eliminate NOCBs CPU-state Kconfig options
rcu: Remove debugfs tracing
srcu: Remove Classic SRCU
srcu: Fix rcutorture-statistics typo
rcu: Remove SPARSE_RCU_POINTER Kconfig option
rcu: Remove the now-obsolete PROVE_RCU_REPEATEDLY Kconfig option
rcu: Remove typecheck() from RCU locking wrapper functions
rcu: Remove #ifdef moving rcu_end_inkernel_boot from rcupdate.h
rcu: Remove nohz_full full-system-idle state machine
rcu: Remove the RCU_KTHREAD_PRIO Kconfig option
rcu: Remove *_SLOW_* Kconfig options
srcu: Use rnp->lock wrappers to replace explicit memory barriers
rcu: Move rnp->lock wrappers for SRCU use
rcu: Convert rnp->lock wrappers to macros for SRCU use
rcu: Refactor #includes from include/linux/rcupdate.h
bcm47xx: Fix build regression
...
Diffstat (limited to 'kernel/rcu/tree_plugin.h')
-rw-r--r-- | kernel/rcu/tree_plugin.h | 573 |
1 files changed, 86 insertions, 487 deletions
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (gp_preinit_delay) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (gp_init_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (gp_cleanup_delay) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU @@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -286,12 +313,14 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { @@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } @@ -835,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2023,6 +2054,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } @@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); @@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. */ |