From af446b702c58b700cc5fa99f6edc78b99e55b995 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 10 Sep 2011 21:54:08 -0700 Subject: [PATCH 01/64] rcu: ->signaled better named ->fqs_state The ->signaled field was named before complications in the form of dyntick-idle mode and offlined CPUs. These complications have required that force_quiescent_state() be implemented as a state machine, instead of simply unconditionally sending reschedule IPIs. Therefore, this commit renames ->signaled to ->fqs_state to catch up with the new force_quiescent_state() reality. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 16 ++++++++-------- kernel/rcutree.h | 4 ++-- kernel/rcutree_trace.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b76d812740ce0..5d0b55a3a8c002 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; NUM_RCU_LVL_3, \ NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_GP_IDLE, \ + .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ @@ -866,8 +866,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); - rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); @@ -877,7 +877,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, @@ -927,7 +927,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp = rcu_get_root(rsp); raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -991,7 +991,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->signaled = RCU_GP_IDLE; + rsp->fqs_state = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -1457,7 +1457,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_fqs_ret; /* no GP in progress, time updated. */ } rsp->fqs_active = 1; - switch (rsp->signaled) { + switch (rsp->fqs_state) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1473,7 +1473,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (rcu_gp_in_progress(rsp)) - rsp->signaled = RCU_FORCE_QS; + rsp->fqs_state = RCU_FORCE_QS; break; case RCU_FORCE_QS: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 849ce9ec51fec1..517f2f89a29323 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -302,7 +302,7 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for signaled field in struct rcu_state. */ +/* Values for fqs_state field in struct rcu_state. */ #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ @@ -361,7 +361,7 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 signaled ____cacheline_internodealigned_in_smp; + u8 fqs_state ____cacheline_internodealigned_in_smp; /* Force QS state. */ u8 fqs_active; /* force_quiescent_state() */ /* is running. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9feffa4c069567..59c7bee4ce0f78 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -278,7 +278,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", - rsp->completed, gpnum, rsp->signaled, + rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, From 389abd48efe1ceacb141b2fd151263b1bc432dbc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Sep 2011 14:41:37 -0700 Subject: [PATCH 02/64] rcu: Avoid RCU-preempt expedited grace-period botch Because rcu_read_unlock_special() samples rcu_preempted_readers_exp(rnp) after dropping rnp->lock, the following sequence of events is possible: 1. Task A exits its RCU read-side critical section, and removes itself from the ->blkd_tasks list, releases rnp->lock, and is then preempted. Task B remains on the ->blkd_tasks list, and blocks the current expedited grace period. 2. Task B exits from its RCU read-side critical section and removes itself from the ->blkd_tasks list. Because it is the last task blocking the current expedited grace period, it ends that expedited grace period. 3. Task A resumes, and samples rcu_preempted_readers_exp(rnp) which of course indicates that nothing is blocking the nonexistent expedited grace period. Task A is again preempted. 4. Some other CPU starts an expedited grace period. There are several tasks blocking this expedited grace period queued on the same rcu_node structure that Task A was using in step 1 above. 5. Task A examines its state and incorrectly concludes that it was the last task blocking the expedited grace period on the current rcu_node structure. It therefore reports completion up the rcu_node tree. 6. The expedited grace period can then incorrectly complete before the tasks blocked on this same rcu_node structure exit their RCU read-side critical sections. Arbitrarily bad things happen. This commit therefore takes a snapshot of rcu_preempted_readers_exp(rnp) prior to dropping the lock, so that only the last task thinks that it is the last task, thus avoiding the failure scenario laid out above. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b9b9f8a41846d..798605317161d8 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; + int empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, + * so we must take a snapshot of the expedited state. */ + empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report("preempt_rcu", rnp->gpnum, @@ -406,7 +409,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ - if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(&rcu_preempt_state, rnp); } else { local_irq_restore(flags); From 7077714ec4940a6c5b1189c3afb4f47bf49ad877 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Sep 2011 13:18:44 -0700 Subject: [PATCH 03/64] rcu: Make synchronize_sched_expedited() better at work sharing When synchronize_sched_expedited() takes its second and subsequent snapshots of sync_sched_expedited_started, it subtracts 1. This means that the concurrent caller of synchronize_sched_expedited() that incremented to that value sees our successful completion, it will not be able to take advantage of it. This restriction is pointless, given that our full expedited grace period would have happened after the other guy started, and thus should be able to serve as a proxy for the other guy successfully executing try_stop_cpus(). This commit therefore removes the subtraction of 1. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 798605317161d8..708dc579634d4d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1910,7 +1910,7 @@ void synchronize_sched_expedited(void) * grace period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started) - 1; + snap = atomic_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } From b804cb9e91c6c304959c69d4f9daeef4ffdba71c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Sep 2011 10:23:39 -0700 Subject: [PATCH 04/64] lockdep: Update documentation for lock-class leak detection There are a number of bugs that can leak or overuse lock classes, which can cause the maximum number of lock classes (currently 8191) to be exceeded. However, the documentation does not tell you how to track down these problems. This commit addresses this shortcoming. Signed-off-by: Paul E. McKenney --- Documentation/lockdep-design.txt | 63 ++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt index abf768c681e208..5dbc99c04f6e3d 100644 --- a/Documentation/lockdep-design.txt +++ b/Documentation/lockdep-design.txt @@ -221,3 +221,66 @@ when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we dont have to validate the chain again. + +Troubleshooting: +---------------- + +The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. +Exceeding this number will trigger the following lockdep warning: + + (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + +By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical +desktop systems have less than 1,000 lock classes, so this warning +normally results from lock-class leakage or failure to properly +initialize locks. These two problems are illustrated below: + +1. Repeated module loading and unloading while running the validator + will result in lock-class leakage. The issue here is that each + load of the module will create a new set of lock classes for + that module's locks, but module unloading does not remove old + classes (see below discussion of reuse of lock classes for why). + Therefore, if that module is loaded and unloaded repeatedly, + the number of lock classes will eventually reach the maximum. + +2. Using structures such as arrays that have large numbers of + locks that are not explicitly initialized. For example, + a hash table with 8192 buckets where each bucket has its own + spinlock_t will consume 8192 lock classes -unless- each spinlock + is explicitly initialized at runtime, for example, using the + run-time spin_lock_init() as opposed to compile-time initializers + such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize + the per-bucket spinlocks would guarantee lock-class overflow. + In contrast, a loop that called spin_lock_init() on each lock + would place all 8192 locks into a single lock class. + + The moral of this story is that you should always explicitly + initialize your locks. + +One might argue that the validator should be modified to allow +lock classes to be reused. However, if you are tempted to make this +argument, first review the code and think through the changes that would +be required, keeping in mind that the lock classes to be removed are +likely to be linked into the lock-dependency graph. This turns out to +be harder to do than to say. + +Of course, if you do run out of lock classes, the next thing to do is +to find the offending lock classes. First, the following command gives +you the number of lock classes currently in use along with the maximum: + + grep "lock-classes" /proc/lockdep_stats + +This command produces the following output on a modest system: + + lock-classes: 748 [max: 8191] + +If the number allocated (748 above) increases continually over time, +then there is likely a leak. The following command can be used to +identify the leaking lock classes: + + grep "BD" /proc/lockdep + +Run the command and save the output, then compare against the output from +a later run of this command to identify the leakers. This same output +can also help you find situations where runtime lock initialization has +been omitted. From 9b2e4f1880b789be1f24f9684f7a54b90310b5c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Sep 2011 12:10:22 -0700 Subject: [PATCH 05/64] rcu: Track idleness independent of idle tasks Earlier versions of RCU used the scheduling-clock tick to detect idleness by checking for the idle task, but handled idleness differently for CONFIG_NO_HZ=y. But there are now a number of uses of RCU read-side critical sections in the idle task, for example, for tracing. A more fine-grained detection of idleness is therefore required. This commit presses the old dyntick-idle code into full-time service, so that rcu_idle_enter(), previously known as rcu_enter_nohz(), is always invoked at the beginning of an idle loop iteration. Similarly, rcu_idle_exit(), previously known as rcu_exit_nohz(), is always invoked at the end of an idle-loop iteration. This allows the idle task to use RCU everywhere except between consecutive rcu_idle_enter() and rcu_idle_exit() calls, in turn allowing architecture maintainers to specify exactly where in the idle loop that RCU may be used. Because some of the userspace upcall uses can result in what looks to RCU like half of an interrupt, it is not possible to expect that the irq_enter() and irq_exit() hooks will give exact counts. This patch therefore expands the ->dynticks_nesting counter to 64 bits and uses two separate bitfields to count process/idle transitions and interrupt entry/exit transitions. It is presumed that userspace upcalls do not happen in the idle loop or from usermode execution (though usermode might do a system call that results in an upcall). The counter is hard-reset on each process/idle transition, which avoids the interrupt entry/exit error from accumulating. Overflow is avoided by the 64-bitness of the ->dyntick_nesting counter. This commit also adds warnings if a non-idle task asks RCU to enter idle state (and these checks will need some adjustment before applying Frederic's OS-jitter patches (http://lkml.org/lkml/2011/10/7/246). In addition, validation of ->dynticks and ->dynticks_nesting is added. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 4 - include/linux/hardirq.h | 21 ---- include/linux/rcupdate.h | 21 +--- include/linux/tick.h | 11 +- include/trace/events/rcu.h | 10 +- kernel/rcutiny.c | 124 ++++++++++++++++--- kernel/rcutree.c | 229 +++++++++++++++++++++++++----------- kernel/rcutree.h | 15 +-- kernel/rcutree_trace.c | 10 +- kernel/time/tick-sched.c | 6 +- 10 files changed, 297 insertions(+), 154 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index aaf65f6c6cd784..49587abfc2f77c 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -105,14 +105,10 @@ o "dt" is the current value of the dyntick counter that is incremented or one greater than the interrupt-nesting depth otherwise. The number after the second "/" is the NMI nesting depth. - This field is displayed only for CONFIG_NO_HZ kernels. - o "df" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being in dynticks-idle state. - This field is displayed only for CONFIG_NO_HZ kernels. - o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being offline. In a perfect world, this might never happen, but it diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f743883f769e9b..bb7f30971858ca 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk) extern void account_system_vtime(struct task_struct *tsk); #endif -#if defined(CONFIG_NO_HZ) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -static inline void rcu_irq_enter(void) -{ - rcu_exit_nohz(); -} - -static inline void rcu_irq_exit(void) -{ - rcu_enter_nohz(); -} static inline void rcu_nmi_enter(void) { @@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void) } #else -extern void rcu_irq_enter(void); -extern void rcu_irq_exit(void); extern void rcu_nmi_enter(void); extern void rcu_nmi_exit(void); #endif -#else -# define rcu_irq_enter() do { } while (0) -# define rcu_irq_exit() do { } while (0) -# define rcu_nmi_enter() do { } while (0) -# define rcu_nmi_exit() do { } while (0) -#endif /* #if defined(CONFIG_NO_HZ) */ /* * It is safe to do non-atomic ops on ->hardirq_context, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2cf4226ade7e60..cd1ad4b04c6d69 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -177,23 +177,10 @@ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; - -#ifdef CONFIG_NO_HZ - -extern void rcu_enter_nohz(void); -extern void rcu_exit_nohz(void); - -#else /* #ifdef CONFIG_NO_HZ */ - -static inline void rcu_enter_nohz(void) -{ -} - -static inline void rcu_exit_nohz(void) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ */ +extern void rcu_idle_enter(void); +extern void rcu_idle_exit(void); +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); /* * Infrastructure to implement the synchronize_() primitives in diff --git a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee291d..ca40838fdfb723 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -127,8 +127,15 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) { } -static inline void tick_nohz_restart_sched_tick(void) { } +static inline void tick_nohz_stop_sched_tick(int inidle) +{ + if (inidle) + rcu_idle_enter(); +} +static inline void tick_nohz_restart_sched_tick(void) +{ + rcu_idle_exit(); +} static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 669fbd62ec2558..e5771804c5073b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,19 +246,21 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity), + TP_PROTO(char *polarity, int nesting), - TP_ARGS(polarity), + TP_ARGS(polarity, nesting), TP_STRUCT__entry( __field(char *, polarity) + __field(int, nesting) ), TP_fast_assign( __entry->polarity = polarity; + __entry->nesting = nesting; ), - TP_printk("%s", __entry->polarity) + TP_printk("%s %d", __entry->polarity, __entry->nesting) ); /* @@ -443,7 +445,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity) do { } while (0) +#define trace_rcu_dyntick(polarity, nesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 636af6d9c6e565..3ab77bdc90c4a1 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,31 +53,122 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -#ifdef CONFIG_NO_HZ +static long long rcu_dynticks_nesting = LLONG_MAX / 2; -static long rcu_dynticks_nesting = 1; +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +static void rcu_idle_enter_common(void) +{ + if (rcu_dynticks_nesting) { + RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} /* - * Enter dynticks-idle mode, which is an extended quiescent state - * if we have fully entered that mode (i.e., if the new value of - * dynticks_nesting is zero). + * Enter idle, which is an extended quiescent state if we have fully + * entered that mode (i.e., if the new value of dynticks_nesting is zero). */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { - if (--rcu_dynticks_nesting == 0) - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting = 0; + rcu_idle_enter_common(); + local_irq_restore(flags); } /* - * Exit dynticks-idle mode, so that we are no longer in an extended - * quiescent state. + * Exit an interrupt handler towards idle. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_dynticks_nesting--; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + rcu_idle_enter_common(); + local_irq_restore(flags); +} + +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ +static void rcu_idle_exit_common(long long oldval) +{ + if (oldval) { + RCU_TRACE(trace_rcu_dyntick("++=", rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick("End", oldval)); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + oldval)); + ftrace_dump(DUMP_ALL); + } +} + +/* + * Exit idle, so that we are no longer in an extended quiescent state. */ -void rcu_exit_nohz(void) +void rcu_idle_exit(void) { + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rcu_dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +/* + * Enter an interrupt handler, moving away from idle. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting++; + WARN_ON_ONCE(rcu_dynticks_nesting == 0); + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} + +#ifdef CONFIG_PROVE_RCU + +/* + * Test whether RCU thinks that the current CPU is idle. + */ +int rcu_is_cpu_idle(void) +{ + return !rcu_dynticks_nesting; } -#endif /* #ifdef CONFIG_NO_HZ */ +#endif /* #ifdef CONFIG_PROVE_RCU */ + +/* + * Test whether the current CPU was interrupted from idle. Nested + * interrupts don't count, we must be running at the first interrupt + * level. + */ +int rcu_is_cpu_rrupt_from_idle(void) +{ + return rcu_dynticks_nesting <= 0; +} /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). @@ -126,14 +217,13 @@ void rcu_bh_qs(int cpu) /* * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. + * quiescent state, and, if so, tell RCU about it. This function must + * be called from hardirq context. It is normally called from the + * scheduling-clock interrupt. */ void rcu_check_callbacks(int cpu, int user) { - if (user || - (idle_cpu(cpu) && - !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) + if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5d0b55a3a8c002..1c40326724f65f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -#ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, + .dynticks_nesting = LLONG_MAX / 2, .dynticks = ATOMIC_INIT(1), }; -#endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ @@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptible RCU, no point in sending reschedule IPI. */ - if (rdp->preemptible) - return 0; - - /* The CPU is online, so send it a reschedule IPI. */ + /* + * The CPU is online, so send it a reschedule IPI. This forces + * it through the scheduler, and (inefficiently) also handles cases + * where idle loops fail to inform RCU about the CPU being idle. + */ if (rdp->cpu != smp_processor_id()) smp_send_reschedule(rdp->cpu); else @@ -343,51 +341,97 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#ifdef CONFIG_NO_HZ +/* + * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * + * If the new value of the ->dynticks_nesting counter now is zero, + * we really have entered idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) +{ + if (rdtp->dynticks_nesting) { + trace_rcu_dyntick("--=", rdtp->dynticks_nesting); + return; + } + trace_rcu_dyntick("Start", rdtp->dynticks_nesting); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on entry: not idle task", + rdtp->dynticks_nesting); + ftrace_dump(DUMP_ALL); + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} /** - * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * rcu_idle_enter - inform RCU that current CPU is entering idle * - * Enter nohz mode, in other words, -leave- the mode in which RCU + * Enter idle mode, in other words, -leave- the mode in which RCU * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in nohz mode, a possibility - * handled by rcu_irq_enter() and rcu_irq_exit()). + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -void rcu_enter_nohz(void) +void rcu_idle_enter(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting) { - local_irq_restore(flags); - return; - } - trace_rcu_dyntick("Start"); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rdtp->dynticks_nesting = 0; + rcu_idle_enter_common(rdtp); local_irq_restore(flags); } -/* - * rcu_exit_nohz - inform RCU that current CPU is leaving nohz +/** + * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. * - * Exit nohz mode, in other words, -enter- the mode in which RCU - * read-side critical sections normally occur. + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture violates this assumption, RCU will give you what you + * deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. */ -void rcu_exit_nohz(void) +void rcu_irq_exit(void) { unsigned long flags; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nesting++) { - local_irq_restore(flags); + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + rcu_idle_enter_common(rdtp); + local_irq_restore(flags); +} + +/* + * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * + * If the new value of the ->dynticks_nesting counter was previously zero, + * we really have exited idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +{ + if (oldval) { + trace_rcu_dyntick("++=", rdtp->dynticks_nesting); return; } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ @@ -395,7 +439,71 @@ void rcu_exit_nohz(void) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - trace_rcu_dyntick("End"); + trace_rcu_dyntick("End", oldval); + if (!idle_cpu(smp_processor_id())) { + WARN_ON_ONCE(1); /* must be idle task! */ + trace_rcu_dyntick("Error on exit: not idle task", oldval); + ftrace_dump(DUMP_ALL); + } +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to LLONG_MAX/2 to allow for + * the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval != 0); + rdtp->dynticks_nesting = LLONG_MAX / 2; + rcu_idle_exit_common(rdtp, oldval); + local_irq_restore(flags); +} + +/** + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to + * user mode! This code assumes that the idle loop never does upcalls to + * user mode. If your architecture does do upcalls from the idle loop (or + * does anything else that results in unbalanced calls to the irq_enter() + * and irq_exit() functions), RCU will give you what you deserve, good + * and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -442,27 +550,32 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } +#ifdef CONFIG_PROVE_RCU + /** - * rcu_irq_enter - inform RCU of entry to hard irq context + * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * - * If the CPU was idle with dynamic ticks active, this updates the - * rdtp->dynticks to let the RCU handling know that the CPU is active. + * If the current CPU is in its idle loop and is neither in an interrupt + * or NMI handler, return true. The caller must have at least disabled + * preemption. */ -void rcu_irq_enter(void) +int rcu_is_cpu_idle(void) { - rcu_exit_nohz(); + return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; } +#endif /* #ifdef CONFIG_PROVE_RCU */ + /** - * rcu_irq_exit - inform RCU of exit from hard irq context + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * - * If the CPU was idle with dynamic ticks active, update the rdp->dynticks - * to put let the RCU handling be aware that the CPU is going back to idle - * with no ticks. + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. */ -void rcu_irq_exit(void) +int rcu_is_cpu_rrupt_from_idle(void) { - rcu_enter_nohz(); + return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } #ifdef CONFIG_SMP @@ -512,24 +625,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -#else /* #ifdef CONFIG_NO_HZ */ - -#ifdef CONFIG_SMP - -static int dyntick_save_progress_counter(struct rcu_data *rdp) -{ - return 0; -} - -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) -{ - return rcu_implicit_offline_qs(rdp); -} - -#endif /* #ifdef CONFIG_SMP */ - -#endif /* #else #ifdef CONFIG_NO_HZ */ - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -1334,16 +1429,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). * Also schedule RCU core processing. * - * This function must be called with hardirqs disabled. It is normally + * This function must be called from hardirq context. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns * false, there is no point in invoking rcu_check_callbacks(). */ void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); - if (user || - (idle_cpu(cpu) && rcu_scheduler_active && - !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + if (user || rcu_is_cpu_rrupt_from_idle()) { /* * Get here if this CPU took its interrupt from user @@ -1913,9 +2006,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; -#ifdef CONFIG_NO_HZ rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -#endif /* #ifdef CONFIG_NO_HZ */ + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1942,6 +2035,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 517f2f89a29323..0963fa1541acd0 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,9 +84,10 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ + long long dynticks_nesting; /* Track irq/process nesting level. */ + /* Process level is worth LLONG_MAX/2. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ }; /* RCU's kthread states for tracing. */ @@ -274,16 +275,12 @@ struct rcu_data { /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ -#ifdef CONFIG_NO_HZ /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ -#endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ -#ifdef CONFIG_NO_HZ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ -#endif /* #ifdef CONFIG_NO_HZ */ unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ @@ -307,11 +304,7 @@ struct rcu_data { #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS -#endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 59c7bee4ce0f78..654cfe67f0d1fe 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d/%d df=%lu", + seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, @@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->completed, rdp->gpnum, rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); -#ifdef CONFIG_NO_HZ - seq_printf(m, ",%d,%d,%d,%lu", + seq_printf(m, ",%d,%llx,%d,%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -#endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); -#ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); -#endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0baa3..5d9d23665f12fe 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -434,7 +434,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; - rcu_enter_nohz(); } ts->idle_sleeps++; @@ -473,6 +472,8 @@ void tick_nohz_stop_sched_tick(int inidle) ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); end: + if (inidle) + rcu_idle_enter(); local_irq_restore(flags); } @@ -529,6 +530,7 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); + rcu_idle_exit(); if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); @@ -543,8 +545,6 @@ void tick_nohz_restart_sched_tick(void) ts->inidle = 0; - rcu_exit_nohz(); - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); From a8eecf2248a45bf69f0625b23c003ad2ccd765ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 11:01:15 -0700 Subject: [PATCH 06/64] trace: Allow ftrace_dump() to be called from modules Add an EXPORT_SYMBOL_GPL() so that rcutorture can dump the trace buffer upon detection of an RCU error. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f5c..a043d224adf65c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4775,6 +4775,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { __ftrace_dump(true, oops_dump_mode); } +EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { From 91afaf300269aa99a4d646969b3258b74294ac4d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 07:44:32 -0700 Subject: [PATCH 07/64] rcu: Add failure tracing to rcutorture Trace the rcutorture RCU accesses and dump the trace buffer when the first failure is detected. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++++++ include/trace/events/rcu.h | 26 ++++++++++++++++++++++++++ kernel/rcupdate.c | 10 ++++++++++ kernel/rcutorture.c | 18 ++++++++++++++++++ 4 files changed, 62 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index cd1ad4b04c6d69..8d315b013e3737 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); #else static inline void rcutorture_record_test_transition(void) { @@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +#ifdef CONFIG_RCU_TRACE +extern void do_trace_rcu_torture_read(char *rcutorturename, + struct rcu_head *rhp); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif #endif #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e5771804c5073b..172620a92b1a86 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -437,6 +437,31 @@ TRACE_EVENT(rcu_batch_end, __entry->rcuname, __entry->callbacks_invoked) ); +/* + * Tracepoint for rcutorture readers. The first argument is the name + * of the RCU flavor from rcutorture's viewpoint and the second argument + * is the callback address. + */ +TRACE_EVENT(rcu_torture_read, + + TP_PROTO(char *rcutorturename, struct rcu_head *rhp), + + TP_ARGS(rcutorturename, rhp), + + TP_STRUCT__entry( + __field(char *, rcutorturename) + __field(struct rcu_head *, rhp) + ), + + TP_fast_assign( + __entry->rcutorturename = rcutorturename; + __entry->rhp = rhp; + ), + + TP_printk("%s torture read %p", + __entry->rcutorturename, __entry->rhp) +); + #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) @@ -452,6 +477,7 @@ TRACE_EVENT(rcu_batch_end, #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) #define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c5b98e565aee28..92e771d7b44b7f 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -316,3 +316,13 @@ struct debug_obj_descr rcuhead_debug_descr = { }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +{ + trace_rcu_torture_read(rcutorturename, rhp); +} +EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#endif diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 764825c2685c49..df35228e743b79 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -913,6 +913,18 @@ rcu_torture_fakewriter(void *arg) return 0; } +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); + ftrace_dump(DUMP_ALL); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -934,6 +946,7 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -951,6 +964,8 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { @@ -994,6 +1009,7 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1009,6 +1025,8 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } + if (pipe_count > 1) + rcutorture_trace_dump(); __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { From 2c01531f08f8ba663a20d8472a3032f6df133b6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Oct 2011 17:21:18 -0700 Subject: [PATCH 08/64] rcu: Document failing tick as cause of RCU CPU stall warning One of lclaudio's systems was seeing RCU CPU stall warnings from idle. These turned out to be caused by a bug that stopped scheduling-clock tick interrupts from being sent to a given CPU for several hundred seconds. This commit therefore updates the documentation to call this out as a possible cause for RCU CPU stall warnings. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/stallwarn.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 4e959208f7363c..f3e0625f4290dd 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -101,6 +101,11 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning messages. +o A hardware or software issue shuts off the scheduler-clock + interrupt on a CPU that is not in dyntick-idle mode. This + problem really has happened, and seems to be most likely to + result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. + o A bug in the RCU implementation. o A hardware failure. This is quite unlikely, but has occurred From 34240697d619c439c55f21989680024dcb604aab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Oct 2011 11:38:52 -0700 Subject: [PATCH 09/64] rcu: Disable preemption in rcu_is_cpu_idle() Because rcu_is_cpu_idle() is to be used to check for extended quiescent states in RCU-preempt read-side critical sections, it cannot assume that preemption is disabled. And preemption must be disabled when accessing the dyntick-idle state, because otherwise the following sequence of events could occur: 1. Task A on CPU 1 enters rcu_is_cpu_idle() and picks up the pointer to CPU 1's per-CPU variables. 2. Task B preempts Task A and starts running on CPU 1. 3. Task A migrates to CPU 2. 4. Task B blocks, leaving CPU 1 idle. 5. Task A continues execution on CPU 2, accessing CPU 1's dyntick-idle information using the pointer fetched in step 1 above, and finds that CPU 1 is idle. 6. Task A therefore incorrectly concludes that it is executing in an extended quiescent state, possibly issuing a spurious splat. Therefore, this commit disables preemption within the rcu_is_cpu_idle() function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1c40326724f65f..69b6cdd4f94488 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -556,12 +556,16 @@ void rcu_nmi_exit(void) * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle * * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. The caller must have at least disabled - * preemption. + * or NMI handler, return true. */ int rcu_is_cpu_idle(void) { - return (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + int ret; + + preempt_disable(); + ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + preempt_enable(); + return ret; } #endif /* #ifdef CONFIG_PROVE_RCU */ From b40d293eb36ba40cd428b6d178db911174689702 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2011 07:12:34 -0700 Subject: [PATCH 10/64] rcu: Omit self-awaken when setting up expedited grace period When setting up an expedited grace period, if there were no readers, the task will awaken itself. This commit removes this useless self-awakening. Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 3 ++- kernel/rcutree_plugin.h | 16 +++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 69b6cdd4f94488..8afb2e89745bd6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1320,7 +1320,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) else raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, true); rcu_node_kthread_setaffinity(rnp, -1); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0963fa1541acd0..fd2f87db2ab16f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 708dc579634d4d..0f095d1cc16db1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -410,7 +410,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp); + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); } else { local_irq_restore(flags); } @@ -732,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * * Caller must hold sync_rcu_preempt_exp_mutex. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { unsigned long flags; unsigned long mask; @@ -747,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - wake_up(&sync_rcu_preempt_exp_wq); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); break; } mask = rnp->grpmask; @@ -780,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) must_wait = 1; } if (!must_wait) - rcu_report_exp_rnp(rsp, rnp); + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /* @@ -1072,7 +1077,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) { return; } From a0f8eefb127f5be07628954f310a7fc8c82b2fc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Nov 2011 12:08:17 -0700 Subject: [PATCH 11/64] rcu: Remove redundant return from rcu_report_exp_rnp() Empty void functions do not need "return", so this commit removes it from rcu_report_exp_rnp(). Signed-off-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0f095d1cc16db1..7a7961feeecf2e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1080,7 +1080,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) { - return; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ From e6b80a3b0994ea6c3d876d72464f2debbfcfeb05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:25:18 -0700 Subject: [PATCH 12/64] rcu: Detect illegal rcu dereference in extended quiescent state Report that none of the rcu read lock maps are held while in an RCU extended quiescent state (the section between rcu_idle_enter() and rcu_idle_exit()). This helps detect any use of rcu_dereference() and friends from within the section in idle where RCU is not allowed. This way we can guarantee an extended quiescent window where the CPU can be put in dyntick idle mode or can simply aoid to be part of any global grace period completion while in the idle loop. Uses of RCU from such mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ kernel/rcupdate.c | 2 ++ kernel/rcutiny.c | 1 + kernel/rcutree.c | 1 + 4 files changed, 30 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8d315b013e3737..bf91fcfe181cee 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -228,6 +228,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) #ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PROVE_RCU +extern int rcu_is_cpu_idle(void); +#else /* !CONFIG_PROVE_RCU */ +static inline int rcu_is_cpu_idle(void) +{ + return 0; +} +#endif /* else !CONFIG_PROVE_RCU */ + extern struct lockdep_map rcu_lock_map; # define rcu_read_acquire() \ lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) @@ -262,6 +271,8 @@ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -285,6 +296,19 @@ extern int rcu_read_lock_bh_held(void); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -293,6 +317,8 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 92e771d7b44b7f..2bc4e135ff23d6 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; + if (rcu_is_cpu_idle()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 3ab77bdc90c4a1..b4e0b498176869 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -157,6 +157,7 @@ int rcu_is_cpu_idle(void) { return !rcu_dynticks_nesting; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8afb2e89745bd6..489b62a67d359a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -567,6 +567,7 @@ int rcu_is_cpu_idle(void) preempt_enable(); return ret; } +EXPORT_SYMBOL(rcu_is_cpu_idle); #endif /* #ifdef CONFIG_PROVE_RCU */ From 0464e937485f15d2add78e3b0f498469f4e6600d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:01 +0200 Subject: [PATCH 13/64] rcu: Inform the user about extended quiescent state on PROVE_RCU warning Inform the user if an RCU usage error is detected by lockdep while in an extended quiescent state (in this case, the RCU-free window in idle). This is accomplished by adding a line to the RCU lockdep splat indicating whether or not the splat occurred in extended quiescent state. Uses of RCU from within extended quiescent state mode are totally ignored by RCU, hence the importance of this diagnostic. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/lockdep.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b2e08c932d91c6..f45c6817770e75 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4170,6 +4170,28 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + + /* + * If a CPU is in the RCU-free window in idle (ie: in the section + * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * considers that CPU to be in an "extended quiescent state", + * which means that RCU will be completely ignoring that CPU. + * Therefore, rcu_read_lock() and friends have absolutely no + * effect on a CPU running in that state. In other words, even if + * such an RCU-idle CPU has called rcu_read_lock(), RCU might well + * delete data structures out from under it. RCU really has no + * choice here: we need to keep an RCU-free window in idle where + * the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run + * in the idle task. + * + * So complain bitterly if someone does call rcu_read_lock(), + * rcu_read_lock_bh() and so on from extended quiescent states. + */ + if (rcu_is_cpu_idle()) + printk("RCU used illegally from extended quiescent state!\n"); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); From 00f49e5729af602deb559b0cf293a00b625e8636 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:02 +0200 Subject: [PATCH 14/64] rcu: Warn when rcu_read_lock() is used in extended quiescent state We are currently able to detect uses of rcu_dereference_check() inside extended quiescent states (such as the RCU-free window in idle). But rcu_read_lock() and friends can be used without rcu_dereference(), so that the earlier commit checking for use of rcu_dereference() and friends while in RCU idle mode miss some error conditions. This commit therefore adds extended quiescent state checking to rcu_read_lock() and friends. Uses of RCU from within RCU-idle mode are totally ignored by RCU, hence the importance of these checks. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 52 ++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bf91fcfe181cee..d201c155f70c27 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -237,21 +237,53 @@ static inline int rcu_is_cpu_idle(void) } #endif /* else !CONFIG_PROVE_RCU */ +static inline void rcu_lock_acquire(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); +} + +static inline void rcu_lock_release(struct lockdep_map *map) +{ + WARN_ON_ONCE(rcu_is_cpu_idle()); + lock_release(map, 1, _THIS_IP_); +} + extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() \ - lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire(void) +{ + rcu_lock_acquire(&rcu_lock_map); +} + +static inline void rcu_read_release(void) +{ + rcu_lock_release(&rcu_lock_map); +} extern struct lockdep_map rcu_bh_lock_map; -# define rcu_read_acquire_bh() \ - lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire_bh(void) +{ + rcu_lock_acquire(&rcu_bh_lock_map); +} + +static inline void rcu_read_release_bh(void) +{ + rcu_lock_release(&rcu_bh_lock_map); +} extern struct lockdep_map rcu_sched_lock_map; -# define rcu_read_acquire_sched() \ - lock_acquire(&rcu_sched_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define rcu_read_release_sched() \ - lock_release(&rcu_sched_lock_map, 1, _THIS_IP_) + +static inline void rcu_read_acquire_sched(void) +{ + rcu_lock_acquire(&rcu_sched_lock_map); +} + +static inline void rcu_read_release_sched(void) +{ + rcu_lock_release(&rcu_sched_lock_map); +} extern int debug_lockdep_rcu_enabled(void); From d8ab29f8be918b34a1ccd174569a53f0eb04b0a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:03 +0200 Subject: [PATCH 15/64] rcu: Remove one layer of abstraction from PROVE_RCU checking Simplify things a bit by substituting the definitions of the single-line rcu_read_acquire(), rcu_read_release(), rcu_read_acquire_bh(), rcu_read_release_bh(), rcu_read_acquire_sched(), and rcu_read_release_sched() functions at their call points. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 53 ++++++---------------------------------- 1 file changed, 8 insertions(+), 45 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d201c155f70c27..5dd6fd8b3203f6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -250,41 +250,8 @@ static inline void rcu_lock_release(struct lockdep_map *map) } extern struct lockdep_map rcu_lock_map; - -static inline void rcu_read_acquire(void) -{ - rcu_lock_acquire(&rcu_lock_map); -} - -static inline void rcu_read_release(void) -{ - rcu_lock_release(&rcu_lock_map); -} - extern struct lockdep_map rcu_bh_lock_map; - -static inline void rcu_read_acquire_bh(void) -{ - rcu_lock_acquire(&rcu_bh_lock_map); -} - -static inline void rcu_read_release_bh(void) -{ - rcu_lock_release(&rcu_bh_lock_map); -} - extern struct lockdep_map rcu_sched_lock_map; - -static inline void rcu_read_acquire_sched(void) -{ - rcu_lock_acquire(&rcu_sched_lock_map); -} - -static inline void rcu_read_release_sched(void) -{ - rcu_lock_release(&rcu_sched_lock_map); -} - extern int debug_lockdep_rcu_enabled(void); /** @@ -364,12 +331,8 @@ static inline int rcu_read_lock_sched_held(void) #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -# define rcu_read_acquire_bh() do { } while (0) -# define rcu_read_release_bh() do { } while (0) -# define rcu_read_acquire_sched() do { } while (0) -# define rcu_read_release_sched() do { } while (0) +# define rcu_lock_acquire(a) do { } while (0) +# define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { @@ -690,7 +653,7 @@ static inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); - rcu_read_acquire(); + rcu_lock_acquire(&rcu_lock_map); } /* @@ -710,7 +673,7 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_read_release(); + rcu_lock_release(&rcu_lock_map); __release(RCU); __rcu_read_unlock(); } @@ -731,7 +694,7 @@ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); - rcu_read_acquire_bh(); + rcu_lock_acquire(&rcu_bh_lock_map); } /* @@ -741,7 +704,7 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_read_release_bh(); + rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } @@ -758,7 +721,7 @@ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); - rcu_read_acquire_sched(); + rcu_lock_acquire(&rcu_sched_lock_map); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ @@ -775,7 +738,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_read_release_sched(); + rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } From ff195cb69ba8d2af9b891be3a26db95fe1999d43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:04 +0200 Subject: [PATCH 16/64] rcu: Warn when srcu_read_lock() is used in an extended quiescent state Catch SRCU up to the other variants of RCU by making PROVE_RCU complain if either srcu_read_lock() or srcu_read_lock_held() are used from within RCU-idle mode. Frederic reworked this to allow for the new versions of his patches that check for extended quiescent states. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 58971e891f4899..4e0a3d41dae3a4 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -28,6 +28,7 @@ #define _LINUX_SRCU_H #include +#include struct srcu_struct_array { int c[2]; @@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, __init_srcu_struct((sp), #sp, &__srcu_key); \ }) -# define srcu_read_acquire(sp) \ - lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_) -# define srcu_read_release(sp) \ - lock_release(&(sp)->dep_map, 1, _THIS_IP_) - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *sp); -# define srcu_read_acquire(sp) do { } while (0) -# define srcu_read_release(sp) do { } while (0) - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ void cleanup_srcu_struct(struct srcu_struct *sp); @@ -90,12 +83,29 @@ long srcu_batches_completed(struct srcu_struct *sp); * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. + * + * Note that if the CPU is in the idle loop from an RCU point of view + * (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then srcu_read_lock_held() returns false even if + * the CPU did an srcu_read_lock(). The reason for this is that RCU + * ignores CPUs that are in such a section, considering these as in + * extended quiescent state, so such a CPU is effectively never in an + * RCU read-side critical section regardless of what RCU primitives it + * invokes. This state of affairs is required --- we need to keep an + * RCU-free window in idle where the CPU may possibly enter into low + * power mode. This way we can notice an extended quiescent state to + * other CPUs that started a grace period. Otherwise we would delay any + * grace period as long as we run in the idle task. */ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (debug_locks) - return lock_is_held(&sp->dep_map); - return 1; + if (rcu_is_cpu_idle()) + return 0; + + if (!debug_locks) + return 1; + + return lock_is_held(&sp->dep_map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -150,7 +160,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { int retval = __srcu_read_lock(sp); - srcu_read_acquire(sp); + rcu_lock_acquire(&(sp)->dep_map); return retval; } @@ -164,7 +174,7 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp) { - srcu_read_release(sp); + rcu_lock_release(&(sp)->dep_map); __srcu_read_unlock(sp, idx); } From 867f236bd12f5091df6dc7cc75f94d7fd982d78a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Oct 2011 18:22:05 +0200 Subject: [PATCH 17/64] rcu: Make srcu_read_lock_held() call common lockdep-enabled function A common debug_lockdep_rcu_enabled() function is used to check whether RCU lockdep splats should be reported, but srcu_read_lock() does not use it. This commit therefore brings srcu_read_lock_held() up to date. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4e0a3d41dae3a4..d4b12443b2efd8 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,6 +84,9 @@ long srcu_batches_completed(struct srcu_struct *sp); * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * * Note that if the CPU is in the idle loop from an RCU point of view * (ie: that we are in the section between rcu_idle_enter() and * rcu_idle_exit()) then srcu_read_lock_held() returns false even if @@ -102,7 +105,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) if (rcu_is_cpu_idle()) return 0; - if (!debug_locks) + if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&sp->dep_map); From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: [PATCH 18/64] nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/arm/kernel/process.c | 4 +- arch/avr32/kernel/process.c | 4 +- arch/blackfin/kernel/process.c | 4 +- arch/microblaze/kernel/process.c | 4 +- arch/mips/kernel/process.c | 4 +- arch/openrisc/kernel/idle.c | 4 +- arch/powerpc/kernel/idle.c | 4 +- arch/powerpc/platforms/iseries/setup.c | 8 +-- arch/s390/kernel/process.c | 4 +- arch/sh/kernel/idle.c | 4 +- arch/sparc/kernel/process_64.c | 4 +- arch/tile/kernel/process.c | 4 +- arch/um/kernel/process.c | 4 +- arch/unicore32/kernel/process.c | 4 +- arch/x86/kernel/process_32.c | 4 +- arch/x86/kernel/process_64.c | 4 +- include/linux/tick.h | 13 ++-- kernel/softirq.c | 2 +- kernel/time/tick-sched.c | 93 ++++++++++++++++---------- 19 files changed, 99 insertions(+), 77 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3d0c6fb74ae4ef..3f1f8daf703cfa 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ef5a2a08fcca24..6ee7952248dbe5 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 6a80a9e9fc4aef..7b141b5c9e8d62 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 95cc295976a732..5407f09b4be46c 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c47f96e453c0fd..c11e5ca2a434a4 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index d5bc5f813e89e5..fb6a9bf40006d1 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 39a2baa6ad58ae..878572f70ac5f5 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -56,7 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index ea0acbd8966ddc..e83dfaf89f6978 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 9451b210a1b4f1..6224f9dbbc1f8c 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) default_idle(); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index db4ecd731a0037..6015743020a07a 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 3739a06a76cbfd..9c2795ba2cfe54 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 9c45d8bbdf57a7..920e674aedb946 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c5338351aecd20..cfb657e9284912 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index ba401df971ed80..9999b9a84d4677 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f984c230..6d9d4d52cac5af 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6eebf316a..b069e9d7875f7c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index ca40838fdfb723..0df1d50a408ae1 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -121,21 +121,22 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_stop_sched_tick(int inidle); -extern void tick_nohz_restart_sched_tick(void); +extern void tick_nohz_idle_enter(void); +extern void tick_nohz_idle_exit(void); +extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_stop_sched_tick(int inidle) +static inline void tick_nohz_idle_enter(void) { - if (inidle) - rcu_idle_enter(); + rcu_idle_enter(); } -static inline void tick_nohz_restart_sched_tick(void) +static inline void tick_nohz_idle_exit(void) { rcu_idle_exit(); } + static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c71d91efff0f8..f9f2aa81ce53af 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -351,7 +351,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_stop_sched_tick(0); + tick_nohz_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d9d23665f12fe..266c242dc354dd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); -/** - * tick_nohz_stop_sched_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - * Called either from the idle loop or from irq_exit() when an idle period was - * just interrupted by an interrupt which did not cause a reschedule. - */ -void tick_nohz_stop_sched_tick(int inidle) +static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - struct tick_sched *ts; + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; - local_irq_save(flags); - cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); - /* - * Call to tick_nohz_start_idle stops the last_update_time from being - * updated. Thus, it must not be called in the event we are called from - * irq_exit() with the prior state different than idle. - */ - if (!inidle && !ts->inidle) - goto end; - - /* - * Set ts->inidle unconditionally. Even if the system did not - * switch to NOHZ mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ - ts->inidle = 1; - now = tick_nohz_start_idle(cpu, ts); /* @@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) - goto end; + return; if (need_resched()) - goto end; + return; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; @@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) (unsigned int) local_softirq_pending()); ratelimit++; } - goto end; + return; } ts->idle_calls++; @@ -471,10 +446,54 @@ void tick_nohz_stop_sched_tick(int inidle) ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; ts->sleep_length = ktime_sub(dev->next_event, now); -end: - if (inidle) - rcu_idle_enter(); - local_irq_restore(flags); +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * This also enters into RCU extended quiescent state so that this CPU doesn't + * need anymore to be part of any global grace period completion. This way + * the tick can be stopped safely as we don't need to report quiescent states. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + local_irq_disable(); + + ts = &__get_cpu_var(tick_cpu_sched); + /* + * set ts->inidle unconditionally. even if the system did not + * switch to nohz mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + tick_nohz_stop_sched_tick(ts); + rcu_idle_enter(); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!ts->inidle) + return; + + tick_nohz_stop_sched_tick(ts); } /** @@ -516,11 +535,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_restart_sched_tick - restart the idle tick from the idle task + * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. */ -void tick_nohz_restart_sched_tick(void) +void tick_nohz_idle_exit(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: [PATCH 19/64] nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney --- arch/arm/kernel/process.c | 4 +-- arch/avr32/kernel/process.c | 4 +-- arch/blackfin/kernel/process.c | 4 +-- arch/microblaze/kernel/process.c | 4 +-- arch/mips/kernel/process.c | 4 +-- arch/openrisc/kernel/idle.c | 4 +-- arch/powerpc/kernel/idle.c | 4 +-- arch/powerpc/platforms/iseries/setup.c | 8 ++--- arch/s390/kernel/process.c | 4 +-- arch/sh/kernel/idle.c | 4 +-- arch/sparc/kernel/process_64.c | 4 +-- arch/tile/kernel/process.c | 4 +-- arch/um/kernel/process.c | 4 +-- arch/unicore32/kernel/process.c | 4 +-- arch/x86/kernel/process_32.c | 4 +-- arch/x86/kernel/process_64.c | 4 +-- include/linux/tick.h | 46 ++++++++++++++++++++++++-- kernel/time/tick-sched.c | 25 +++++++------- 18 files changed, 90 insertions(+), 49 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 3f1f8daf703cfa..47e34c091276f4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +213,7 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 6ee7952248dbe5..34c8c703bb161e 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,10 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 7b141b5c9e8d62..57e07498a0e76d 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,10 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 5407f09b4be46c..13d59f34b94e15 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,10 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index c11e5ca2a434a4..17fb3a270160d3 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,7 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +77,7 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index fb6a9bf40006d1..2e82cd0fa5e193 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +69,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 878572f70ac5f5..2e782a36d8f2e3 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -56,7 +56,7 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +93,7 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index e83dfaf89f6978..d69d3d185e8928 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,7 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +577,7 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +593,7 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +610,7 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6224f9dbbc1f8c..6fa987367ae670 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,10 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) default_idle(); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 6015743020a07a..ad58e7535a7c47 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +111,7 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 9c2795ba2cfe54..4a0e7d79cb926e 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,12 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 920e674aedb946..53ac89595ab153 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +105,7 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index cfb657e9284912..55d2cf455f630b 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,10 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 9999b9a84d4677..095ff5a57928fb 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,7 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +63,7 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d9d4d52cac5af..f94da3920c36bc 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b069e9d7875f7c..18e8cf3581f660 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0df1d50a408ae1..327434a0575725 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -7,6 +7,7 @@ #define _LINUX_TICK_H #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -121,18 +122,57 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void tick_nohz_idle_enter(void); +extern void __tick_nohz_idle_enter(void); +static inline void tick_nohz_idle_enter(void) +{ + local_irq_disable(); + __tick_nohz_idle_enter(); + local_irq_enable(); +} extern void tick_nohz_idle_exit(void); + +/* + * Call this pair of function if the arch doesn't make any use + * of RCU in-between. You won't need to call rcu_idle_enter() and + * rcu_idle_exit(). + * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() + * and explicitly tell RCU about the window around the place the CPU enters low + * power mode where no RCU use is made. This is done by calling rcu_idle_enter() + * after the last use of RCU before the CPU is put to sleep and by calling + * rcu_idle_exit() before the first use of RCU after the CPU woke up. + */ +static inline void tick_nohz_idle_enter_norcu(void) +{ + /* + * Also call rcu_idle_enter() in the irq disabled section even + * if it disables irq itself. + * Just an optimization that prevents from an interrupt happening + * between it and __tick_nohz_idle_enter() to lose time to help + * completing a grace period while we could be in extended grace + * period already. + */ + local_irq_disable(); + __tick_nohz_idle_enter(); + rcu_idle_enter(); + local_irq_enable(); +} +static inline void tick_nohz_idle_exit_norcu(void) +{ + rcu_idle_exit(); + tick_nohz_idle_exit(); +} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else -static inline void tick_nohz_idle_enter(void) +static inline void tick_nohz_idle_enter(void) { } +static inline void tick_nohz_idle_exit(void) { } +static inline void tick_nohz_idle_enter_norcu(void) { rcu_idle_enter(); } -static inline void tick_nohz_idle_exit(void) +static inline void tick_nohz_idle_exit_norcu(void) { rcu_idle_exit(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 266c242dc354dd..c76aefe764b0f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -453,18 +453,22 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) * * When the next event is more than a tick into the future, stop the idle tick * Called when we start the idle loop. - * This also enters into RCU extended quiescent state so that this CPU doesn't - * need anymore to be part of any global grace period completion. This way - * the tick can be stopped safely as we don't need to report quiescent states. + * + * If no use of RCU is made in the idle loop between + * tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, then + * tick_nohz_idle_enter_norcu() should be called instead and the arch + * doesn't need to call rcu_idle_enter() and rcu_idle_exit() explicitly. + * + * Otherwise the arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. */ -void tick_nohz_idle_enter(void) +void __tick_nohz_idle_enter(void) { struct tick_sched *ts; - WARN_ON_ONCE(irqs_disabled()); - - local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); /* * set ts->inidle unconditionally. even if the system did not @@ -473,9 +477,6 @@ void tick_nohz_idle_enter(void) */ ts->inidle = 1; tick_nohz_stop_sched_tick(ts); - rcu_idle_enter(); - - local_irq_enable(); } /** @@ -551,7 +552,7 @@ void tick_nohz_idle_exit(void) ktime_t now; local_irq_disable(); - rcu_idle_exit(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) now = ktime_get(); From e37e112de3ac64032df45c2db0dbe1e8f1af86b4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:08 +0200 Subject: [PATCH 20/64] x86: Enter rcu extended qs after idle notifier call The idle notifier, called by enter_idle(), enters into rcu read side critical section but at that time we already switched into the RCU-idle window (rcu_idle_enter() has been called). And it's illegal to use rcu_read_lock() in that state. This results in rcu reporting its bad mood: [ 1.275635] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110() [ 1.275635] Hardware name: AMD690VM-FMH [ 1.275635] Modules linked in: [ 1.275635] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #252 [ 1.275635] Call Trace: [ 1.275635] [] warn_slowpath_common+0x7a/0xb0 [ 1.275635] [] warn_slowpath_null+0x15/0x20 [ 1.275635] [] __atomic_notifier_call_chain+0xd2/0x110 [ 1.275635] [] atomic_notifier_call_chain+0x11/0x20 [ 1.275635] [] enter_idle+0x20/0x30 [ 1.275635] [] cpu_idle+0xa5/0x110 [ 1.275635] [] rest_init+0xe5/0x140 [ 1.275635] [] ? rest_init+0x48/0x140 [ 1.275635] [] start_kernel+0x3d1/0x3dc [ 1.275635] [] x86_64_start_reservations+0x131/0x135 [ 1.275635] [] x86_64_start_kernel+0xed/0xf4 [ 1.275635] ---[ end trace a22d306b065d4a66 ]--- Fix this by entering rcu extended quiescent state later, just before the CPU goes to sleep. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/process_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 18e8cf3581f660..64e926c89a6fcd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + if (cpuidle_idle_call()) pm_idle(); + + rcu_idle_exit(); start_critical_timings(); /* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit_norcu(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); From 98ad1cc14a5c4fd658f9d72c6ba5c86dfd3ce0d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:09 +0200 Subject: [PATCH 21/64] x86: Call idle notifier after irq_enter() Interrupts notify the idle exit state before calling irq_enter(). But the notifier code calls rcu_read_lock() and this is not allowed while rcu is in an extended quiescent state. We need to wait for irq_enter() -> rcu_idle_exit() to be called before doing so otherwise this results in a grumpy RCU: [ 0.099991] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110() [ 0.099991] Hardware name: AMD690VM-FMH [ 0.099991] Modules linked in: [ 0.099991] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #255 [ 0.099991] Call Trace: [ 0.099991] [] warn_slowpath_common+0x7a/0xb0 [ 0.099991] [] warn_slowpath_null+0x15/0x20 [ 0.099991] [] __atomic_notifier_call_chain+0xd2/0x110 [ 0.099991] [] atomic_notifier_call_chain+0x11/0x20 [ 0.099991] [] exit_idle+0x43/0x50 [ 0.099991] [] smp_apic_timer_interrupt+0x39/0xa0 [ 0.099991] [] apic_timer_interrupt+0x13/0x20 [ 0.099991] [] ? default_idle+0xa7/0x350 [ 0.099991] [] ? default_idle+0xa5/0x350 [ 0.099991] [] amd_e400_idle+0x8b/0x110 [ 0.099991] [] ? rcu_enter_nohz+0x8f/0x160 [ 0.099991] [] cpu_idle+0xb0/0x110 [ 0.099991] [] rest_init+0xe5/0x140 [ 0.099991] [] ? rest_init+0x48/0x140 [ 0.099991] [] start_kernel+0x3d1/0x3dc [ 0.099991] [] x86_64_start_reservations+0x131/0x135 [ 0.099991] [] x86_64_start_kernel+0xed/0xf4 Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Andy Henroid Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/apic/apic.c | 6 +++--- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- arch/x86/kernel/cpu/mcheck/threshold.c | 2 +- arch/x86/kernel/irq.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84caf94cfd..2cd2d93643dcde 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -876,8 +876,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - exit_idle(); irq_enter(); + exit_idle(); local_apic_timer_interrupt(); irq_exit(); @@ -1809,8 +1809,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; - exit_idle(); irq_enter(); + exit_idle(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1846,8 +1846,8 @@ void smp_error_interrupt(struct pt_regs *regs) "Illegal register address", /* APIC Error Bit 7 */ }; - exit_idle(); irq_enter(); + exit_idle(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7847e293..89805558551649 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) unsigned vector, me; ack_APIC_irq(); - exit_idle(); irq_enter(); + exit_idle(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c84ea6a8..ce215616d5b979 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -397,8 +397,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2909c9fe..aa578cadb9407d 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void smp_threshold_interrupt(void) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c92924eed..5d31e5bdbf852b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -181,8 +181,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - exit_idle(); irq_enter(); + exit_idle(); irq = __this_cpu_read(vector_irq[vector]); @@ -209,10 +209,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs) ack_APIC_irq(); - exit_idle(); - irq_enter(); + exit_idle(); + inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) From 416eb33cd60ef405e2860a186364e57bcb2d89f6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 16:31:02 -0700 Subject: [PATCH 22/64] rcu: Fix early call to rcu_idle_enter() On the irq exit path, tick_nohz_irq_exit() may raise a softirq, which action leads to the wake up path and select_task_rq_fair() that makes use of rcu to iterate the domains. This is an illegal use of RCU because we may be in RCU extended quiescent state if we interrupted an RCU-idle window in the idle loop: [ 132.978883] =============================== [ 132.978883] [ INFO: suspicious RCU usage. ] [ 132.978883] ------------------------------- [ 132.978883] kernel/sched_fair.c:1707 suspicious rcu_dereference_check() usage! [ 132.978883] [ 132.978883] other info that might help us debug this: [ 132.978883] [ 132.978883] [ 132.978883] rcu_scheduler_active = 1, debug_locks = 0 [ 132.978883] RCU used illegally from extended quiescent state! [ 132.978883] 2 locks held by swapper/0: [ 132.978883] #0: (&p->pi_lock){-.-.-.}, at: [] try_to_wake_up+0x39/0x2f0 [ 132.978883] #1: (rcu_read_lock){.+.+..}, at: [] select_task_rq_fair+0x6a/0xec0 [ 132.978883] [ 132.978883] stack backtrace: [ 132.978883] Pid: 0, comm: swapper Tainted: G W 3.0.0+ #178 [ 132.978883] Call Trace: [ 132.978883] [] lockdep_rcu_suspicious+0xe6/0x100 [ 132.978883] [] select_task_rq_fair+0x749/0xec0 [ 132.978883] [] ? select_task_rq_fair+0x6a/0xec0 [ 132.978883] [] ? do_raw_spin_lock+0x54/0x150 [ 132.978883] [] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [] try_to_wake_up+0xd3/0x2f0 [ 132.978883] [] ? ktime_get+0x68/0xf0 [ 132.978883] [] wake_up_process+0x15/0x20 [ 132.978883] [] raise_softirq_irqoff+0x65/0x110 [ 132.978883] [] __hrtimer_start_range_ns+0x415/0x5a0 [ 132.978883] [] ? do_raw_spin_unlock+0x5e/0xb0 [ 132.978883] [] hrtimer_start+0x18/0x20 [ 132.978883] [] tick_nohz_stop_sched_tick+0x393/0x450 [ 132.978883] [] irq_exit+0xd2/0x100 [ 132.978883] [] do_IRQ+0x66/0xe0 [ 132.978883] [] common_interrupt+0x13/0x13 [ 132.978883] [] ? native_safe_halt+0xb/0x10 [ 132.978883] [] ? trace_hardirqs_on+0xd/0x10 [ 132.978883] [] default_idle+0xba/0x370 [ 132.978883] [] amd_e400_idle+0x5e/0x130 [ 132.978883] [] cpu_idle+0xb6/0x120 [ 132.978883] [] rest_init+0xef/0x150 [ 132.978883] [] ? rest_init+0x52/0x150 [ 132.978883] [] start_kernel+0x3da/0x3e5 [ 132.978883] [] x86_64_start_reservations+0x131/0x135 [ 132.978883] [] x86_64_start_kernel+0x103/0x112 Fix this by calling rcu_idle_enter() after tick_nohz_irq_exit(). Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index f9f2aa81ce53af..4eb3a0fa351e78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -347,12 +347,12 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); - rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_irq_exit(); #endif + rcu_irq_exit(); preempt_enable_no_resched(); } From a7b152d5342c06e81ab0cf7d18345f69fa7e56b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Oct 2011 19:05:12 -0700 Subject: [PATCH 23/64] powerpc: Tell RCU about idle after hcall tracing The PowerPC pSeries platform (CONFIG_PPC_PSERIES=y) enables hypervisor-call tracing for CONFIG_TRACEPOINTS=y kernels. One of the hypervisor calls that is traced is the H_CEDE call in the idle loop that tells the hypervisor that this OS instance no longer needs the current CPU. However, tracing uses RCU, so this combination of kernel configuration variables needs to avoid telling RCU about the current CPU's idleness until after the H_CEDE-entry tracing completes on the one hand, and must tell RCU that the the current CPU is no longer idle before the H_CEDE-exit tracing starts. In all other cases, it suffices to inform RCU of CPU idleness upon idle-loop entry and exit. This commit makes the required adjustments. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/powerpc/kernel/idle.c | 16 ++++++++++++++-- arch/powerpc/platforms/pseries/lpar.c | 4 ++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 2e782a36d8f2e3..3cd73d1fc427b3 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -46,6 +46,12 @@ static int __init powersave_off(char *arg) } __setup("powersave=off", powersave_off); +#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_TRACEPOINTS) +static const bool idle_uses_rcu = 1; +#else +static const bool idle_uses_rcu; +#endif + /* * The body of the idle task. */ @@ -56,7 +62,10 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter_norcu(); + if (idle_uses_rcu) + tick_nohz_idle_enter(); + else + tick_nohz_idle_enter_norcu(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -93,7 +102,10 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - tick_nohz_idle_exit_norcu(); + if (idle_uses_rcu) + tick_nohz_idle_exit(); + else + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 27a49508b41071..52d429be6c764a 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -555,6 +555,8 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args) (*depth)++; trace_hcall_entry(opcode, args); + if (opcode == H_CEDE) + rcu_idle_enter(); (*depth)--; out: @@ -575,6 +577,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval, goto out; (*depth)++; + if (opcode == H_CEDE) + rcu_idle_exit(); trace_hcall_exit(opcode, retval, retbuf); (*depth)--; From 0c53dd8b31404c1d7fd15be8f065ebaec615a562 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 9 Oct 2011 15:13:11 -0700 Subject: [PATCH 24/64] rcu: Introduce raw SRCU read-side primitives The RCU implementations, including SRCU, are designed to be used in a lock-like fashion, so that the read-side lock and unlock primitives must execute in the same context for any given read-side critical section. This constraint is enforced by lockdep-RCU. However, there is a need to enter an SRCU read-side critical section within the context of an exception and then exit in the context of the task that encountered the exception. The cost of this capability is that the read-side operations incur the overhead of disabling interrupts. Note that although the current implementation allows a given read-side critical section to be entered by one task and then exited by another, all known possible implementations that allow this have scalability problems. Therefore, a given read-side critical section must be exited by the same task that entered it, though perhaps from an interrupt or exception handler running within that task's context. But if you are thinking in terms of interrupt handlers, make sure that you have considered the possibility of threaded interrupt handlers. Credit goes to Peter Zijlstra for suggesting use of the existing _raw suffix to indicate disabling lockdep over the earlier "bulkref" names. Requested-by: Srikar Dronamraju Signed-off-by: Paul E. McKenney Tested-by: Srikar Dronamraju --- include/linux/srcu.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d4b12443b2efd8..1eb520cd1680db 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,4 +181,47 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/** + * srcu_read_lock_raw - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section. Similar to srcu_read_lock(), + * but avoids the RCU-lockdep checking. This means that it is legal to + * use srcu_read_lock_raw() in one context, for example, in an exception + * handler, and then have the matching srcu_read_unlock_raw() in another + * context, for example in the task that took the exception. + * + * However, the entire SRCU read-side critical section must reside within a + * single task. For example, beware of using srcu_read_lock_raw() in + * a device interrupt handler and srcu_read_unlock() in the interrupted + * task: This will not work if interrupts are threaded. + */ +static inline int srcu_read_lock_raw(struct srcu_struct *sp) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __srcu_read_lock(sp); + local_irq_restore(flags); + return ret; +} + +/** + * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock_raw(). + * + * Exit an SRCU read-side critical section without lockdep-RCU checking. + * See srcu_read_lock_raw() for more details. + */ +static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx) +{ + unsigned long flags; + + local_irq_save(flags); + __srcu_read_unlock(sp, idx); + local_irq_restore(flags); +} + #endif From 9ceae0e248fb553c702d51d5275167d462f4efd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Nov 2011 13:43:24 -0700 Subject: [PATCH 25/64] rcu: Add documentation for raw SRCU read-side primitives Update various files in Documentation/RCU to reflect srcu_read_lock_raw() and srcu_read_unlock_raw(). Credit to Peter Zijlstra for suggesting use of the existing _raw suffix instead of the earlier bulkref names. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 6 ++++++ Documentation/RCU/rcu.txt | 10 +++++----- Documentation/RCU/stallwarn.txt | 11 +++++------ Documentation/RCU/whatisRCU.txt | 18 +++++++++++++----- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 0c134f8afc6f60..bff2d8be1e18ca 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -328,6 +328,12 @@ over a rather long period of time, but improvements are always welcome! RCU rather than SRCU, because RCU is almost always faster and easier to use than is SRCU. + If you need to enter your read-side critical section in a + hardirq or exception handler, and then exit that same read-side + critical section in the task that was interrupted, then you need + to srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid + the lockdep checking that would otherwise this practice illegal. + Also unlike other forms of RCU, explicit initialization and cleanup is required via init_srcu_struct() and cleanup_srcu_struct(). These are passed a "struct srcu_struct" diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 31852705b58630..bf778332a28f8e 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt @@ -38,11 +38,11 @@ o How can the updater tell when a grace period has completed Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the same effect, but require that the readers manipulate CPU-local - counters. These counters allow limited types of blocking - within RCU read-side critical sections. SRCU also uses - CPU-local counters, and permits general blocking within - RCU read-side critical sections. These two variants of - RCU detect grace periods by sampling these counters. + counters. These counters allow limited types of blocking within + RCU read-side critical sections. SRCU also uses CPU-local + counters, and permits general blocking within RCU read-side + critical sections. These variants of RCU detect grace periods + by sampling these counters. o If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f3e0625f4290dd..083d88cbc0899f 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -114,12 +114,11 @@ o A hardware failure. This is quite unlikely, but has occurred This resulted in a series of RCU CPU stall warnings, eventually leading the realization that the CPU had failed. -The RCU, RCU-sched, and RCU-bh implementations have CPU stall -warning. SRCU does not have its own CPU stall warnings, but its -calls to synchronize_sched() will result in RCU-sched detecting -RCU-sched-related CPU stalls. Please note that RCU only detects -CPU stalls when there is a grace period in progress. No grace period, -no CPU stall warnings. +The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. +SRCU does not have its own CPU stall warnings, but its calls to +synchronize_sched() will result in RCU-sched detecting RCU-sched-related +CPU stalls. Please note that RCU only detects CPU stalls when there is +a grace period in progress. No grace period, no CPU stall warnings. To diagnose the cause of the stall, inspect the stack traces. The offending function will usually be near the top of the stack. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 6ef692667e2f67..8e8cdc2430b9ba 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -834,6 +834,8 @@ SRCU: Critical sections Grace period Barrier srcu_read_lock synchronize_srcu N/A srcu_read_unlock synchronize_srcu_expedited + srcu_read_lock_raw + srcu_read_unlock_raw srcu_dereference SRCU: Initialization/cleanup @@ -855,27 +857,33 @@ list can be helpful: a. Will readers need to block? If so, you need SRCU. -b. What about the -rt patchset? If readers would need to block +b. Is it necessary to start a read-side critical section in a + hardirq handler or exception handler, and then to complete + this read-side critical section in the task that was + interrupted? If so, you need SRCU's srcu_read_lock_raw() and + srcu_read_unlock_raw() primitives. + +c. What about the -rt patchset? If readers would need to block in an non-rt kernel, you need SRCU. If readers would block in a -rt kernel, but not in a non-rt kernel, SRCU is not necessary. -c. Do you need to treat NMI handlers, hardirq handlers, +d. Do you need to treat NMI handlers, hardirq handlers, and code segments with preemption disabled (whether via preempt_disable(), local_irq_save(), local_bh_disable(), or some other mechanism) as if they were explicit RCU readers? If so, you need RCU-sched. -d. Do you need RCU grace periods to complete even in the face +e. Do you need RCU grace periods to complete even in the face of softirq monopolization of one or more of the CPUs? For example, is your code subject to network-based denial-of-service attacks? If so, you need RCU-bh. -e. Is your workload too update-intensive for normal use of +f. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? If so, consider SLAB_DESTROY_BY_RCU. But please be careful! -f. Otherwise, use RCU. +g. Otherwise, use RCU. Of course, this all assumes that you have determined that RCU is in fact the right tool for your job. From 4145fa7fbee3ec1e61c52825b146192885d9759f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Oct 2011 15:01:54 -0700 Subject: [PATCH 26/64] rcu: Deconfuse dynticks entry-exit tracing The trace_rcu_dyntick() trace event did not print both the old and the new value of the nesting level, and furthermore printed only the low-order 32 bits of it. This could result in some confusion when interpreting trace-event dumps, so this commit prints both the old and the new value, prints the full 64 bits, and also selects the process-entry/exit increment to print nicely in hexadecimal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 15 +++++++++------ kernel/rcu.h | 7 +++++++ kernel/rcutiny.c | 28 +++++++++++++++++----------- kernel/rcutree.c | 35 ++++++++++++++++++++--------------- 4 files changed, 53 insertions(+), 32 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 172620a92b1a86..c29fb2f5590907 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -246,21 +246,24 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, int nesting), + TP_PROTO(char *polarity, long long oldnesting, long long newnesting), - TP_ARGS(polarity, nesting), + TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( __field(char *, polarity) - __field(int, nesting) + __field(long long, oldnesting) + __field(long long, newnesting) ), TP_fast_assign( __entry->polarity = polarity; - __entry->nesting = nesting; + __entry->oldnesting = oldnesting; + __entry->newnesting = newnesting; ), - TP_printk("%s %d", __entry->polarity, __entry->nesting) + TP_printk("%s %llx %llx", __entry->polarity, + __entry->oldnesting, __entry->newnesting) ); /* @@ -470,7 +473,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) -#define trace_rcu_dyntick(polarity, nesting) do { } while (0) +#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) diff --git a/kernel/rcu.h b/kernel/rcu.h index f600868d550dc1..aa88baab5f78a7 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -29,6 +29,13 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Process-level increment to ->dynticks_nesting field. This allows for + * architectures that use half-interrupts and half-exceptions from + * process context. + */ +#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index b4e0b498176869..9b9bdf666fb502 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,20 +53,21 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = LLONG_MAX / 2; +static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(void) +static void rcu_idle_enter_common(long long oldval) { if (rcu_dynticks_nesting) { - RCU_TRACE(trace_rcu_dyntick("--=", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("--=", + oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! */ RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", - rcu_dynticks_nesting)); + oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ @@ -79,10 +80,12 @@ static void rcu_idle_enter_common(void) void rcu_idle_enter(void) { unsigned long flags; + long long oldval; local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting = 0; - rcu_idle_enter_common(); + rcu_idle_enter_common(oldval); local_irq_restore(flags); } @@ -92,11 +95,13 @@ void rcu_idle_enter(void) void rcu_irq_exit(void) { unsigned long flags; + long long oldval; local_irq_save(flags); + oldval = rcu_dynticks_nesting; rcu_dynticks_nesting--; WARN_ON_ONCE(rcu_dynticks_nesting < 0); - rcu_idle_enter_common(); + rcu_idle_enter_common(oldval); local_irq_restore(flags); } @@ -104,14 +109,15 @@ void rcu_irq_exit(void) static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick("++=", + oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("End", oldval)); + RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! */ RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", - oldval)); + oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); } } @@ -127,7 +133,7 @@ void rcu_idle_exit(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = LLONG_MAX / 2; + rcu_dynticks_nesting = DYNTICK_TASK_NESTING; rcu_idle_exit_common(oldval); local_irq_restore(flags); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 489b62a67d359a..06e40dd53b2384 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -196,7 +196,7 @@ void rcu_note_context_switch(int cpu) EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = LLONG_MAX / 2, + .dynticks_nesting = DYNTICK_TASK_NESTING, .dynticks = ATOMIC_INIT(1), }; @@ -348,17 +348,17 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) +static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { if (rdtp->dynticks_nesting) { - trace_rcu_dyntick("--=", rdtp->dynticks_nesting); + trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); return; } - trace_rcu_dyntick("Start", rdtp->dynticks_nesting); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! */ trace_rcu_dyntick("Error on entry: not idle task", - rdtp->dynticks_nesting); + oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); } /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ @@ -383,12 +383,14 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp) void rcu_idle_enter(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting = 0; - rcu_idle_enter_common(rdtp); + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -411,13 +413,15 @@ void rcu_idle_enter(void) void rcu_irq_exit(void) { unsigned long flags; + long long oldval; struct rcu_dynticks *rdtp; local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - rcu_idle_enter_common(rdtp); + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -431,7 +435,7 @@ void rcu_irq_exit(void) static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) { if (oldval) { - trace_rcu_dyntick("++=", rdtp->dynticks_nesting); + trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); return; } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ @@ -439,10 +443,11 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - trace_rcu_dyntick("End", oldval); + trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { WARN_ON_ONCE(1); /* must be idle task! */ - trace_rcu_dyntick("Error on exit: not idle task", oldval); + trace_rcu_dyntick("Error on exit: not idle task", + oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); } } @@ -453,8 +458,8 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to LLONG_MAX/2 to allow for - * the possibility of usermode upcalls messing up our count + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. */ @@ -468,7 +473,7 @@ void rcu_idle_exit(void) rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = LLONG_MAX / 2; + rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -2012,7 +2017,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxttail[i] = &rdp->nxtlist; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; @@ -2040,7 +2045,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != LLONG_MAX / 2); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ From 0989cb46783188ea7346ba6490be0046b9b7a725 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Nov 2011 08:57:21 -0700 Subject: [PATCH 27/64] rcu: Add more information to the wrong-idle-task complaint The current code just complains if the current task is not the idle task. This commit therefore adds printing of the identity of the idle task. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 12 ++++++++++-- kernel/rcutree.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9b9bdf666fb502..6d70ff71a875cb 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -65,10 +65,14 @@ static void rcu_idle_enter_common(long long oldval) } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ } @@ -115,10 +119,14 @@ static void rcu_idle_exit_common(long long oldval) } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 06e40dd53b2384..9888a0ad2d4e4d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -356,10 +356,14 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + trace_rcu_dyntick("Error on entry: not idle task", oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -445,10 +449,14 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!idle_cpu(smp_processor_id())) { - WARN_ON_ONCE(1); /* must be idle task! */ + struct task_struct *idle = idle_task(smp_processor_id()); + trace_rcu_dyntick("Error on exit: not idle task", oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ } } From aea1b35e29e658d42d7ba2237f3aa7f93e18509d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2011 06:54:54 -0700 Subject: [PATCH 28/64] rcu: Allow dyntick-idle mode for CPUs with callbacks Currently, RCU does not permit a CPU to enter dyntick-idle mode if that CPU has any RCU callbacks queued. This means that workloads for which each CPU wakes up and does some RCU updates every few ticks will never enter dyntick-idle mode. This can result in significant unnecessary power consumption, so this patch permits a given to enter dyntick-idle mode if it has callbacks, but only if that same CPU has completed all current work for the RCU core. We determine use rcu_pending() to determine whether a given CPU has completed all current work for the RCU core. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 +- kernel/rcutree.h | 4 ++ kernel/rcutree_plugin.h | 156 +++++++++++++++++++++++++++++++--------- 3 files changed, 132 insertions(+), 33 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9888a0ad2d4e4d..b1711c48a7ec05 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -365,6 +365,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -1085,6 +1086,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * callbacks are waiting on the grace period that just now * completed. */ + rcu_schedule_wake_gp_end(); if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1670,6 +1672,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); + rcu_wake_cpus_for_gp_end(); trace_rcu_utilization("End RCU core"); } @@ -1923,7 +1926,7 @@ static int rcu_pending(int cpu) * by the current CPU, even if none need be done immediately, returning * 1 if so. */ -static int rcu_needs_cpu_quick_check(int cpu) +static int rcu_cpu_has_callbacks(int cpu) { /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fd2f87db2ab16f..ea32405177c925 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,6 +88,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + int wake_gp_end; /* A GP ended, need to wake up CPUs. */ }; /* RCU's kthread states for tracing. */ @@ -467,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_for_idle(int cpu); +static void rcu_wake_cpus_for_gp_end(void); +static void rcu_schedule_wake_gp_end(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7a7961feeecf2e..b70ca8cc52e17f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1953,7 +1953,31 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); */ int rcu_needs_cpu(int cpu) { - return rcu_needs_cpu_quick_check(cpu); + return rcu_cpu_has_callbacks(cpu); +} + +/* + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * is nothing. + */ +static void rcu_prepare_for_idle(int cpu) +{ +} + +/* + * CPUs are never putting themselves to sleep with callbacks pending, + * so there is no need to awaken them. + */ +static void rcu_wake_cpus_for_gp_end(void) +{ +} + +/* + * CPUs are never putting themselves to sleep with callbacks pending, + * so there is no need to schedule the act of awakening them. + */ +static void rcu_schedule_wake_gp_end(void) +{ } #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -1961,47 +1985,56 @@ int rcu_needs_cpu(int cpu) #define RCU_NEEDS_CPU_FLUSHES 5 static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(bool, rcu_awake_at_gp_end); /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + */ +int rcu_needs_cpu(int cpu) +{ + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) + return 0; + /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ + return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; +} + +/* + * Check to see if any RCU-related work can be done by the current CPU, + * and if so, schedule a softirq to get it done. This function is part + * of the RCU implementation; it is -not- an exported member of the RCU API. * - * Because we are not supporting preemptible RCU, attempt to accelerate - * any current grace periods so that RCU no longer needs this CPU, but - * only if all other CPUs are already in dynticks-idle mode. This will - * allow the CPU cores to be powered down immediately, as opposed to after - * waiting many milliseconds for grace periods to elapse. + * The idea is for the current CPU to clear out all work required by the + * RCU core for the current grace period, so that this CPU can be permitted + * to enter dyntick-idle mode. In some cases, it will need to be awakened + * at the end of the grace period by whatever CPU ends the grace period. + * This allows CPUs to go dyntick-idle more quickly, and to reduce the + * number of wakeups by a modest integer factor. * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu) +static void rcu_prepare_for_idle(int cpu) { int c = 0; - int snap; - int thatcpu; - /* Check for being in the holdoff period. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) - return rcu_needs_cpu_quick_check(cpu); - - /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_online_cpu(thatcpu) { - if (thatcpu == cpu) - continue; - snap = atomic_add_return(0, &per_cpu(rcu_dynticks, - thatcpu).dynticks); - smp_mb(); /* Order sampling of snap with end of grace period. */ - if ((snap & 0x1) != 0) { - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - return rcu_needs_cpu_quick_check(cpu); - } + /* If no callbacks or in the holdoff period, enter dyntick-idle. */ + if (!rcu_cpu_has_callbacks(cpu)) { + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return; } + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return; /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { @@ -2010,10 +2043,25 @@ int rcu_needs_cpu(int cpu) } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - return rcu_needs_cpu_quick_check(cpu); + if (!rcu_pending(cpu)) { + per_cpu(rcu_awake_at_gp_end, cpu) = 1; + return; /* Nothing to do immediately. */ + } + invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + return; } - /* Do one step pushing remaining RCU callbacks through. */ + /* + * Do one step of pushing the remaining RCU callbacks through + * the RCU core state machine. + */ +#ifdef CONFIG_TREE_PREEMPT_RCU + if (per_cpu(rcu_preempt_data, cpu).nxtlist) { + rcu_preempt_qs(cpu); + force_quiescent_state(&rcu_preempt_state, 0); + c = c || per_cpu(rcu_preempt_data, cpu).nxtlist; + } +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); @@ -2028,7 +2076,51 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) invoke_rcu_core(); - return c; } +/* + * Wake up a CPU by invoking the RCU core. Intended for use by + * rcu_wake_cpus_for_gp_end(), which passes this function to + * smp_call_function_single(). + */ +static void rcu_wake_cpu(void *unused) +{ + invoke_rcu_core(); +} + +/* + * If an RCU grace period ended recently, scan the rcu_awake_at_gp_end + * per-CPU variables, and wake up any CPUs that requested a wakeup. + */ +static void rcu_wake_cpus_for_gp_end(void) +{ + int cpu; + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (!rdtp->wake_gp_end) + return; + rdtp->wake_gp_end = 0; + for_each_online_cpu(cpu) { + if (per_cpu(rcu_awake_at_gp_end, cpu)) { + per_cpu(rcu_awake_at_gp_end, cpu) = 0; + smp_call_function_single(cpu, rcu_wake_cpu, NULL, 0); + } + } +} + +/* + * A grace period has just ended, and so we will need to awaken CPUs + * that now have work to do. But we cannot send IPIs with interrupts + * disabled, so just set a flag so that this will happen upon exit + * from RCU core processing. + */ +static void rcu_schedule_wake_gp_end(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + rdtp->wake_gp_end = 1; +} + +/* @@@ need tracing as well. */ + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ From 11dbaa8cb79a6e4a234a134898436f717a663f01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Nov 2011 07:38:25 -0700 Subject: [PATCH 29/64] rcu: Fix idle-task checks RCU has traditionally relied on idle_cpu() to determine whether a given CPU is running in the context of an idle task, but commit 908a3283 (Fix idle_cpu()) has invalidated this approach. After commit 908a3283, idle_cpu() will return true if the current CPU is currently running the idle task, and will be doing so for the foreseeable future. RCU instead needs to know whether or not the current CPU is currently running the idle task, regardless of what the near future might bring. This commit therefore switches from idle_cpu() to "current->pid != 0". Reported-by: Wu Fengguang Suggested-by: Carsten Emde Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt Tested-by: Wu Fengguang Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 4 ++-- kernel/rcutree.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 6d70ff71a875cb..4e16ce36fa03fa 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -64,7 +64,7 @@ static void rcu_idle_enter_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", @@ -118,7 +118,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b1711c48a7ec05..49e0783fb200fa 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -355,7 +355,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) return; } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", @@ -449,7 +449,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!idle_cpu(smp_processor_id())) { + if (current->pid != 0) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", From b807fbff3102bcac76ed9157d834dc20bb3d133b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Nov 2011 14:56:12 -0700 Subject: [PATCH 30/64] rcu: Permit RCU_FAST_NO_HZ to be used by TREE_PREEMPT_RCU The new implementation of RCU_FAST_NO_HZ is compatible with preemptible RCU, so this commit removes the Kconfig restriction that previously prohibited this. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 43298f9810fba8..82b6a4c675b283 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -469,14 +469,14 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on TREE_RCU && NO_HZ && SMP + depends on NO_HZ && SMP default n help This option causes RCU to attempt to accelerate grace periods - in order to allow the final CPU to enter dynticks-idle state - more quickly. On the other hand, this option increases the - overhead of the dynticks-idle checking, particularly on systems - with large numbers of CPUs. + in order to allow CPUs to enter dynticks-idle state more + quickly. On the other hand, this option increases the overhead + of the dynticks-idle checking, particularly on systems with + large numbers of CPUs. Say Y if energy efficiency is critically important, particularly if you have relatively few CPUs. From d5f546d834ddc44539651e9955eb3577b0a3eb8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Nov 2011 11:44:12 -0700 Subject: [PATCH 31/64] rcu: Add rcutorture system-shutdown capability Although it is easy to run rcutorture tests under KVM, there is currently no nice way to run such a test for a fixed time period, collect all of the rcutorture data, and then shut the system down cleanly. This commit therefore adds an rcutorture module parameter named "shutdown_secs" that specified the run duration in seconds, after which rcutorture terminates the test and powers the system down. The default value for "shutdown_secs" is zero, which disables shutdown. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 5 +++ kernel/rcutorture.c | 68 ++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 783d6c134d3f00..af40929e1cb0b1 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -66,6 +66,11 @@ shuffle_interval to a particular subset of the CPUs, defaults to 3 seconds. Used in conjunction with test_no_idle_hz. +shutdown_secs The number of seconds to run the test before terminating + the test and powering off the system. The default is + zero, which disables test termination and system shutdown. + This capability is useful for automated testing. + stat_interval The number of seconds between output of torture statistics (via printk()). Regardless of the interval, statistics are printed when the module is unloaded. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index df35228e743b79..ce84091291b543 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,9 +61,10 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ @@ -91,6 +92,8 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); module_param(test_boost_interval, int, 0444); @@ -119,6 +122,7 @@ static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -167,6 +171,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -182,6 +187,9 @@ static int fullstop = FULLSTOP_RMMOD; */ static DEFINE_MUTEX(fullstop_mutex); +/* Forward reference. */ +static void rcu_torture_cleanup(void); + /* * Detect and respond to a system shutdown. */ @@ -1250,12 +1258,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d\n", + "test_boost_duration=%d shutdown_secs=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration); + test_boost_interval, test_boost_duration, shutdown_secs); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1305,6 +1313,43 @@ static int rcutorture_booster_init(int cpu) return 0; } +/* + * Cause the rcutorture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs module parameter. + */ +static int +rcu_torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + jiffies_snap = ACCESS_ONCE(jiffies); + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !kthread_should_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu " + "jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = ACCESS_ONCE(jiffies); + } + if (ULONG_CMP_LT(jiffies, shutdown_time)) { + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + rcu_torture_cleanup(); /* Get the success/failure message. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1409,6 +1454,10 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + kthread_stop(shutdown_task); + } /* Wait for all RCU callbacks to fire. */ @@ -1625,6 +1674,17 @@ rcu_torture_init(void) } } } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_run(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + } register_reboot_notifier(&rcutorture_shutdown_nb); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); From bb3bf7052de520f2d21a1275e95fac7a84d89e4c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Nov 2011 13:24:07 -0700 Subject: [PATCH 32/64] rcu: Control rcutorture startup from kernel boot parameters Currently, if rcutorture is built into the kernel, it must be manually started or started from an init script. This is inconvenient for automated KVM testing, where it is good to be able to fully control rcutorture execution from the kernel parameters. This patch therefore adds a module parameter named "rcutorture_runnable" that defaults to zero ("don't start automatically"), but which can be set to one to cause rcutorture to start up immediately during boot. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index ce84091291b543..eed9f46eb0a5c1 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -164,6 +164,8 @@ static int stutter_pause_test; #define RCUTORTURE_RUNNABLE_INIT 0 #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 From c4f3060843506ba6d473ab9a0afe5bd5dc93a00d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 12:41:56 -0800 Subject: [PATCH 33/64] sched: Add is_idle_task() to handle invalidated uses of idle_cpu() Commit 908a3283 (Fix idle_cpu()) invalidated some uses of idle_cpu(), which used to say whether or not the CPU was running the idle task, but now instead says whether or not the CPU is running the idle task in the absence of pending wakeups. Although this new implementation gives a better answer to the question "is this CPU idle?", it also invalidates other uses that were made of idle_cpu(). This commit therefore introduces a new is_idle_task() API member that determines whether or not the specified task is one of the idle tasks, allowing open-coded "->pid == 0" sequences to be replaced by something more meaningful. Suggested-by: Josh Triplett Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c4f3e9b9bc50e..4a7e4d333a275c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2070,6 +2070,14 @@ extern int sched_setscheduler(struct task_struct *, int, extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern struct task_struct *idle_task(int cpu); +/** + * is_idle_task - is the specified task an idle task? + * @tsk: the task in question. + */ +static inline bool is_idle_task(struct task_struct *p) +{ + return p->pid == 0; +} extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); From 99745b6a83414006f5c1e83efaebb423b41b67ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 15:48:45 -0800 Subject: [PATCH 34/64] rcu: Make RCU use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 4 ++-- kernel/rcutree.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4e16ce36fa03fa..e5bd94954fa3f7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -64,7 +64,7 @@ static void rcu_idle_enter_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", @@ -118,7 +118,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49e0783fb200fa..7fb8b0e608116e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -355,7 +355,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) return; } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", @@ -449,7 +449,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (current->pid != 0) { + if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", From 29f043a2caea2860db36fbeda0c17f79bf0cffbe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 15:56:46 -0800 Subject: [PATCH 35/64] sparc: Make SPARC use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Acked-by: David S. Miller Reviewed-by: Josh Triplett --- arch/sparc/kernel/setup_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index fe1e3fc31bc580..ffb883ddd0f010 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -84,7 +84,7 @@ static void prom_sync_me(void) prom_printf("PROM SYNC COMMAND...\n"); show_free_areas(0); - if(current->pid != 0) { + if (!is_idle_task(current)) { local_irq_enable(); sys_sync(); local_irq_disable(); From 7fc20c5cbdd184f32cb0f886f1a069f123f5787a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 15:59:58 -0800 Subject: [PATCH 36/64] kdb: Make KDB use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Cc: Jason Wessel Reviewed-by: Josh Triplett --- kernel/debug/kdb/kdb_support.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 5532dd37aa864e..7d6fb40d21885c 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) (p->exit_state & EXIT_ZOMBIE) ? 'Z' : (p->exit_state & EXIT_DEAD) ? 'E' : (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; - if (p->pid == 0) { + if (is_idle_task(p)) { /* Idle task. Is it really idle, apart from the kdb * interrupt? */ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { From 77aeeebd7b5483582d0eb6e3fd2894771d1fd8e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 16:02:52 -0800 Subject: [PATCH 37/64] events: Make events use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Reviewed-by: Josh Triplett --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d3b9df5962c25b..923c6b5667dbe5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5362,7 +5362,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) + if (!(event->attr.exclude_idle && is_idle_task(current))) if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } From a95f8817f8afa75ab41728bd1bb65024c65c91c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Nov 2011 16:16:13 -0800 Subject: [PATCH 38/64] tile: Make tile use the new is_idle_task() API Change from direct comparison of ->pid with zero to is_idle_task(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Chris Metcalf --- arch/tile/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 25b7b90fd62064..c1eaaa1fcc2089 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -54,7 +54,7 @@ static noinline void force_sig_info_fault(const char *type, int si_signo, if (unlikely(tsk->pid < 2)) { panic("Signal %d (code %d) at %#lx sent to %s!", si_signo, si_code & 0xffff, address, - tsk->pid ? "init" : "the idle task"); + is_idle_task(tsk) ? "the idle task" : "init"); } info.si_signo = si_signo; @@ -515,7 +515,7 @@ static int handle_page_fault(struct pt_regs *regs, if (unlikely(tsk->pid < 2)) { panic("Kernel page fault running %s!", - tsk->pid ? "init" : "the idle task"); + is_idle_task(tsk) ? "the idle task" : "init"); } /* From b58bdccaa8d908e0f71dae396468a0d3f7bb3125 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Nov 2011 17:48:21 -0800 Subject: [PATCH 39/64] rcu: Add rcutorture CPU-hotplug capability Running CPU-hotplug operations concurrently with rcutorture has historically been a good way to find bugs in both RCU and CPU hotplug. This commit therefore adds an rcutorture module parameter called "onoff_interval" that causes a randomly selected CPU-hotplug operation to be executed at the specified interval, in seconds. The default value of "onoff_interval" is zero, which disables rcutorture-instigated CPU-hotplug operations. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 8 +++ kernel/rcutorture.c | 117 ++++++++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 5 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index af40929e1cb0b1..d67068d0d2b981 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -61,6 +61,14 @@ nreaders This is the number of RCU reading threads supported. To properly exercise RCU implementations with preemptible read-side critical sections. +onoff_interval + The number of seconds between each attempt to execute a + randomly selected CPU-hotplug operation. Defaults to + zero, which disables CPU hotplugging. In HOTPLUG_CPU=n + kernels, rcutorture will silently refuse to do any + CPU-hotplug operations regardless of what value is + specified for onoff_interval. + shuffle_interval The number of seconds to keep the test threads affinitied to a particular subset of the CPUs, defaults to 3 seconds. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index eed9f46eb0a5c1..1e422ae1506b3a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ @@ -92,6 +93,8 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); module_param(shutdown_secs, int, 0444); MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); module_param(test_boost, int, 0444); @@ -123,6 +126,9 @@ static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #define RCU_TORTURE_PIPE_LEN 10 @@ -153,6 +159,10 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static long n_online_attempts; +static long n_online_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -1084,7 +1094,8 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " - "rtbf: %ld rtb: %ld nt: %ld", + "rtbf: %ld rtb: %ld nt: %ld " + "onoff: %ld/%ld:%ld/%ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1096,7 +1107,11 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_rterror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, - n_rcu_torture_timers); + n_rcu_torture_timers, + n_online_successes, + n_online_attempts, + n_offline_successes, + n_offline_attempts); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || @@ -1260,12 +1275,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d\n", + "test_boost_duration=%d shutdown_secs=%d " + "onoff_interval=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs); + test_boost_interval, test_boost_duration, shutdown_secs, + onoff_interval); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1338,7 +1355,7 @@ rcu_torture_shutdown(void *arg) schedule_timeout_interruptible(delta); jiffies_snap = ACCESS_ONCE(jiffies); } - if (ULONG_CMP_LT(jiffies, shutdown_time)) { + if (kthread_should_stop()) { VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); return 0; } @@ -1352,6 +1369,94 @@ rcu_torture_shutdown(void *arg) return 0; } +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +rcu_torture_onoff(void *arg) +{ + int cpu; + int maxcpu = -1; + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + while (!kthread_should_stop()) { + cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu)) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + n_offline_attempts++; + if (cpu_down(cpu) == 0) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: " + "offlined %d\n", + torture_type, cpu); + n_offline_successes++; + } + } else { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + n_online_attempts++; + if (cpu_up(cpu) == 0) { + if (verbose) + printk(KERN_ALERT "%s" TORTURE_FLAG + "rcu_torture_onoff task: " + "onlined %d\n", + torture_type, cpu); + n_online_successes++; + } + } + schedule_timeout_interruptible(onoff_interval * HZ); + } + VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + return 0; +} + +static int +rcu_torture_onoff_init(void) +{ + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); + if (IS_ERR(onoff_task)) { + onoff_task = NULL; + return PTR_ERR(onoff_task); + } + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ + if (onoff_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + kthread_stop(onoff_task); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void +rcu_torture_onoff_init(void) +{ +} + +static void rcu_torture_onoff_cleanup(void) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1460,6 +1565,7 @@ rcu_torture_cleanup(void) VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } + rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1687,6 +1793,7 @@ rcu_torture_init(void) goto unwind; } } + rcu_torture_onoff_init(); register_reboot_notifier(&rcutorture_shutdown_nb); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); From 1268fbc746ea1cd279886a740dcbad4ba5232225 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Nov 2011 18:48:14 +0100 Subject: [PATCH 40/64] nohz: Remove tick_nohz_idle_enter_norcu() / tick_nohz_idle_exit_norcu() Those two APIs were provided to optimize the calls of tick_nohz_idle_enter() and rcu_idle_enter() into a single irq disabled section. This way no interrupt happening in-between would needlessly process any RCU job. Now we are talking about an optimization for which benefits have yet to be measured. Let's start simple and completely decouple idle rcu and dyntick idle logics to simplify. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- arch/arm/kernel/process.c | 6 ++-- arch/avr32/kernel/process.c | 6 ++-- arch/blackfin/kernel/process.c | 6 ++-- arch/microblaze/kernel/process.c | 6 ++-- arch/mips/kernel/process.c | 6 ++-- arch/openrisc/kernel/idle.c | 6 ++-- arch/powerpc/kernel/idle.c | 15 ++++---- arch/powerpc/platforms/iseries/setup.c | 12 ++++--- arch/s390/kernel/process.c | 6 ++-- arch/sh/kernel/idle.c | 6 ++-- arch/sparc/kernel/process_64.c | 6 ++-- arch/tile/kernel/process.c | 6 ++-- arch/um/kernel/process.c | 6 ++-- arch/unicore32/kernel/process.c | 6 ++-- arch/x86/kernel/process_32.c | 6 ++-- include/linux/tick.h | 47 +------------------------- kernel/time/tick-sched.c | 15 ++++---- 17 files changed, 76 insertions(+), 91 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 47e34c091276f4..e8e8fe505df140 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -183,7 +183,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); leds_event(led_idle_start); while (!need_resched()) { #ifdef CONFIG_HOTPLUG_CPU @@ -213,7 +214,8 @@ void cpu_idle(void) } } leds_event(led_idle_end); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 34c8c703bb161e..ea33957503243c 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -34,10 +34,12 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) cpu_idle_sleep(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 57e07498a0e76d..8dd0416673cb56 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -88,10 +88,12 @@ void cpu_idle(void) #endif if (!idle) idle = default_idle; - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) idle(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 13d59f34b94e15..7dcb5bfffb7552 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -103,10 +103,12 @@ void cpu_idle(void) if (!idle) idle = default_idle; - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) idle(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 17fb3a270160d3..7955409051c479 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -56,7 +56,8 @@ void __noreturn cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); @@ -77,7 +78,8 @@ void __noreturn cpu_idle(void) system_state == SYSTEM_BOOTING)) play_dead(); #endif - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c index 2e82cd0fa5e193..e5fc7887783069 100644 --- a/arch/openrisc/kernel/idle.c +++ b/arch/openrisc/kernel/idle.c @@ -51,7 +51,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -69,7 +70,8 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 3cd73d1fc427b3..9c3cd490b1bd97 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -62,10 +62,10 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - if (idle_uses_rcu) - tick_nohz_idle_enter(); - else - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + if (!idle_uses_rcu) + rcu_idle_enter(); + while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); @@ -102,10 +102,9 @@ void cpu_idle(void) HMT_medium(); ppc64_runlatch_on(); - if (idle_uses_rcu) - tick_nohz_idle_exit(); - else - tick_nohz_idle_exit_norcu(); + if (!idle_uses_rcu) + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); if (cpu_should_die()) cpu_die(); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index d69d3d185e8928..8fc62586a973a8 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -563,7 +563,8 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched() && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); @@ -577,7 +578,8 @@ static void iseries_shared_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); if (hvlpevent_is_pending()) process_iSeries_events(); @@ -593,7 +595,8 @@ static void iseries_dedicated_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); @@ -610,7 +613,8 @@ static void iseries_dedicated_idle(void) } ppc64_runlatch_on(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6fa987367ae670..3201ae447990c5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -91,10 +91,12 @@ static void default_idle(void) void cpu_idle(void) { for (;;) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) default_idle(); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index ad58e7535a7c47..406508d4ce742d 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -89,7 +89,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -111,7 +112,8 @@ void cpu_idle(void) start_critical_timings(); } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 4a0e7d79cb926e..39d8b05201a23b 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -95,12 +95,14 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while(1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched() && !cpu_is_offline(cpu)) sparc64_yield(cpu); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 53ac89595ab153..4c1ac6e5347ac6 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -85,7 +85,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { if (cpu_is_offline(cpu)) BUG(); /* no HOTPLUG_CPU */ @@ -105,7 +106,8 @@ void cpu_idle(void) local_irq_enable(); current_thread_info()->status |= TS_POLLING; } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 55d2cf455f630b..69f24905abdc43 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -246,10 +246,12 @@ void default_idle(void) if (need_resched()) schedule(); - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); nsecs = disable_timer(); idle_sleep(nsecs); - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); } } diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 095ff5a57928fb..52edc2b6287325 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -55,7 +55,8 @@ void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { local_irq_disable(); stop_critical_timings(); @@ -63,7 +64,8 @@ void cpu_idle(void) local_irq_enable(); start_critical_timings(); } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f94da3920c36bc..485204f58cdaaf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +117,8 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/include/linux/tick.h b/include/linux/tick.h index 327434a0575725..ab8be90b5cc957 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -122,45 +122,8 @@ static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ -extern void __tick_nohz_idle_enter(void); -static inline void tick_nohz_idle_enter(void) -{ - local_irq_disable(); - __tick_nohz_idle_enter(); - local_irq_enable(); -} +extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); - -/* - * Call this pair of function if the arch doesn't make any use - * of RCU in-between. You won't need to call rcu_idle_enter() and - * rcu_idle_exit(). - * Otherwise you need to call tick_nohz_idle_enter() and tick_nohz_idle_exit() - * and explicitly tell RCU about the window around the place the CPU enters low - * power mode where no RCU use is made. This is done by calling rcu_idle_enter() - * after the last use of RCU before the CPU is put to sleep and by calling - * rcu_idle_exit() before the first use of RCU after the CPU woke up. - */ -static inline void tick_nohz_idle_enter_norcu(void) -{ - /* - * Also call rcu_idle_enter() in the irq disabled section even - * if it disables irq itself. - * Just an optimization that prevents from an interrupt happening - * between it and __tick_nohz_idle_enter() to lose time to help - * completing a grace period while we could be in extended grace - * period already. - */ - local_irq_disable(); - __tick_nohz_idle_enter(); - rcu_idle_enter(); - local_irq_enable(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); - tick_nohz_idle_exit(); -} extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); @@ -168,14 +131,6 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } -static inline void tick_nohz_idle_enter_norcu(void) -{ - rcu_idle_enter(); -} -static inline void tick_nohz_idle_exit_norcu(void) -{ - rcu_idle_exit(); -} static inline ktime_t tick_nohz_get_sleep_length(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c76aefe764b0f1..0ec8b832ab6bfe 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -454,21 +454,20 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) * When the next event is more than a tick into the future, stop the idle tick * Called when we start the idle loop. * - * If no use of RCU is made in the idle loop between - * tick_nohz_idle_enter() and tick_nohz_idle_exit() calls, then - * tick_nohz_idle_enter_norcu() should be called instead and the arch - * doesn't need to call rcu_idle_enter() and rcu_idle_exit() explicitly. - * - * Otherwise the arch is responsible of calling: + * The arch is responsible of calling: * * - rcu_idle_enter() after its last use of RCU before the CPU is put * to sleep. * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. */ -void __tick_nohz_idle_enter(void) +void tick_nohz_idle_enter(void) { struct tick_sched *ts; + WARN_ON_ONCE(irqs_disabled()); + + local_irq_disable(); + ts = &__get_cpu_var(tick_cpu_sched); /* * set ts->inidle unconditionally. even if the system did not @@ -477,6 +476,8 @@ void __tick_nohz_idle_enter(void) */ ts->inidle = 1; tick_nohz_stop_sched_tick(ts); + + local_irq_enable(); } /** From 182dd4b277177e8465ad11cd9f85f282946b5578 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 10:55:12 -0800 Subject: [PATCH 41/64] doc: Add load/store guarantees to Documentation/atomic-ops.txt An IRC discussion uncovered many conflicting opinions on what types of data may be atomically loaded and stored. This commit therefore calls this out the official set: pointers, longs, ints, and chars (but not shorts). This commit also gives some examples of compiler mischief that can thwart atomicity. Please note that this discussion is relevant to !SMP kernels if CONFIG_PREEMPT=y: preemption can cause almost as much trouble as can SMP. Signed-off-by: Paul E. McKenney Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Russell King Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Mike Frysinger Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Yoshinori Sato Cc: Richard Kuo Cc: Jes Sorensen Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Koichi Yasutake Cc: Jonas Bonn Cc: Kyle McMartin Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Chris Zankel --- Documentation/atomic_ops.txt | 87 ++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index 3bd585b449270a..27f2b21a9d5cd5 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt @@ -84,6 +84,93 @@ compiler optimizes the section accessing atomic_t variables. *** YOU HAVE BEEN WARNED! *** +Properly aligned pointers, longs, ints, and chars (and unsigned +equivalents) may be atomically loaded from and stored to in the same +sense as described for atomic_read() and atomic_set(). The ACCESS_ONCE() +macro should be used to prevent the compiler from using optimizations +that might otherwise optimize accesses out of existence on the one hand, +or that might create unsolicited accesses on the other. + +For example consider the following code: + + while (a > 0) + do_something(); + +If the compiler can prove that do_something() does not store to the +variable a, then the compiler is within its rights transforming this to +the following: + + tmp = a; + if (a > 0) + for (;;) + do_something(); + +If you don't want the compiler to do this (and you probably don't), then +you should use something like the following: + + while (ACCESS_ONCE(a) < 0) + do_something(); + +Alternatively, you could place a barrier() call in the loop. + +For another example, consider the following code: + + tmp_a = a; + do_something_with(tmp_a); + do_something_else_with(tmp_a); + +If the compiler can prove that do_something_with() does not store to the +variable a, then the compiler is within its rights to manufacture an +additional load as follows: + + tmp_a = a; + do_something_with(tmp_a); + tmp_a = a; + do_something_else_with(tmp_a); + +This could fatally confuse your code if it expected the same value +to be passed to do_something_with() and do_something_else_with(). + +The compiler would be likely to manufacture this additional load if +do_something_with() was an inline function that made very heavy use +of registers: reloading from variable a could save a flush to the +stack and later reload. To prevent the compiler from attacking your +code in this manner, write the following: + + tmp_a = ACCESS_ONCE(a); + do_something_with(tmp_a); + do_something_else_with(tmp_a); + +For a final example, consider the following code, assuming that the +variable a is set at boot time before the second CPU is brought online +and never changed later, so that memory barriers are not needed: + + if (a) + b = 9; + else + b = 42; + +The compiler is within its rights to manufacture an additional store +by transforming the above code into the following: + + b = 42; + if (a) + b = 9; + +This could come as a fatal surprise to other code running concurrently +that expected b to never have the value 42 if a was zero. To prevent +the compiler from doing this, write something like: + + if (a) + ACCESS_ONCE(b) = 9; + else + ACCESS_ONCE(b) = 42; + +Don't even -think- about doing this without proper use of memory barriers, +locks, or atomic operations if variable a can change at runtime! + +*** WARNING: ACCESS_ONCE() DOES NOT IMPLY A BARRIER! *** + Now, we move onto the atomic operation interfaces typically implemented with the help of assembly code. From 045fb9315a2129023d70a0eecf0942e18fca4fcd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 12:13:03 -0800 Subject: [PATCH 42/64] rcu: Update trace_rcu_dyntick() header comment This commit updates the trace_rcu_dyntick() header comment to reflect events added by commit 4b4f421. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c29fb2f5590907..7f6877a3505147 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -241,8 +241,16 @@ TRACE_EVENT(rcu_fqs, /* * Tracepoint for dyntick-idle entry/exit events. These take a string - * as argument: "Start" for entering dyntick-idle mode and "End" for - * leaving it. + * as argument: "Start" for entering dyntick-idle mode, "End" for + * leaving it, "--=" for events moving towards idle, and "++=" for events + * moving away from idle. "Error on entry: not idle task" and "Error on + * exit: not idle task" indicate that a non-idle task is erroneously + * toying with the idle loop. + * + * These events also take a pair of numbers, which indicate the nesting + * depth before and after the event of interest. Note that task-related + * events use the upper bits of each number, while interrupt-related + * events use the lower bits. */ TRACE_EVENT(rcu_dyntick, From 433cdddcd9ac5558068edd7f8d4707a70f7710f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 14:58:03 -0800 Subject: [PATCH 43/64] rcu: Add tracing for RCU_FAST_NO_HZ This commit adds trace_rcu_prep_idle(), which is invoked from rcu_prepare_for_idle() and rcu_wake_cpu() to trace attempts on the part of RCU to force CPUs into dyntick-idle mode. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 18 +++++++++++++++--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 7f6877a3505147..debe453c96230e 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -274,6 +274,42 @@ TRACE_EVENT(rcu_dyntick, __entry->oldnesting, __entry->newnesting) ); +/* + * Tracepoint for RCU preparation for idle, the goal being to get RCU + * processing done so that the current CPU can shut off its scheduling + * clock and enter dyntick-idle mode. One way to accomplish this is + * to drain all RCU callbacks from this CPU, and the other is to have + * done everything RCU requires for the current grace period. In this + * latter case, the CPU will be awakened at the end of the current grace + * period in order to process the remainder of its callbacks. + * + * These tracepoints take a string as argument: + * + * "No callbacks": Nothing to do, no callbacks on this CPU. + * "In holdoff": Nothing to do, holding off after unsuccessful attempt. + * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. + * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "More callbacks": Still more callbacks, try again to clear them out. + * "Callbacks drained": All callbacks processed, off to dyntick idle! + * "CPU awakened at GP end": + */ +TRACE_EVENT(rcu_prep_idle, + + TP_PROTO(char *reason), + + TP_ARGS(reason), + + TP_STRUCT__entry( + __field(char *, reason) + ), + + TP_fast_assign( + __entry->reason = reason; + ), + + TP_printk("%s", __entry->reason) +); + /* * Tracepoint for the registration of a single RCU callback function. * The first argument is the type of RCU, the second argument is @@ -482,6 +518,7 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) +#define trace_rcu_prep_idle(reason) do { } while (0) #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index b70ca8cc52e17f..6467f5669ab71c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2031,10 +2031,13 @@ static void rcu_prepare_for_idle(int cpu) /* If no callbacks or in the holdoff period, enter dyntick-idle. */ if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + trace_rcu_prep_idle("No callbacks"); return; } - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + trace_rcu_prep_idle("In holdoff"); return; + } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { @@ -2044,9 +2047,11 @@ static void rcu_prepare_for_idle(int cpu) /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (!rcu_pending(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_awake_at_gp_end, cpu) = 1; return; /* Nothing to do immediately. */ } + trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; } @@ -2073,9 +2078,15 @@ static void rcu_prepare_for_idle(int cpu) c = c || per_cpu(rcu_bh_data, cpu).nxtlist; } - /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) + /* + * If RCU callbacks are still pending, RCU still needs this CPU. + * So try forcing the callbacks through the grace period. + */ + if (c) { + trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); + } else + trace_rcu_prep_idle("Callbacks drained"); } /* @@ -2085,6 +2096,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_wake_cpu(void *unused) { + trace_rcu_prep_idle("CPU awakened at GP end"); invoke_rcu_core(); } From 3084f2f80cc8a1fd66233722d88beac0fe85e26f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 17:07:11 -0800 Subject: [PATCH 44/64] rcu: Go dyntick-idle more quickly if CPU has serviced current grace period The earlier version would attempt to push callbacks through five times before going into dyntick-idle mode if callbacks remained, but the CPU had done all that it needed to do for the current RCU grace periods. This is wasteful: In most cases, once the CPU has done all that it needs to for the current RCU grace periods, it will make no further progress on the callbacks no matter how many times it loops through the RCU core processing and the idle-entry code. This commit therefore goes to dyntick-idle mode whenever the current CPU has done all it can for the current grace period. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6467f5669ab71c..45790bfb6e8c77 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2028,12 +2028,29 @@ static void rcu_prepare_for_idle(int cpu) { int c = 0; - /* If no callbacks or in the holdoff period, enter dyntick-idle. */ + /* + * If there are no callbacks on this CPU or if RCU has no further + * need for this CPU at the moment, enter dyntick-idle mode. + * Also reset state so as to not prejudice later attempts. + */ if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_drain, cpu) = 0; trace_rcu_prep_idle("No callbacks"); return; } + if (!rcu_pending(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_awake_at_gp_end, cpu) = 1; + return; /* Nothing to do immediately. */ + } + + /* + * If in holdoff mode, just return. We will presumably have + * refrained from disabling the scheduling-clock tick. + */ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { trace_rcu_prep_idle("In holdoff"); return; @@ -2046,11 +2063,6 @@ static void rcu_prepare_for_idle(int cpu) } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (!rcu_pending(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_awake_at_gp_end, cpu) = 1; - return; /* Nothing to do immediately. */ - } trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; From 84ad00cb61f1cb21f0b63bc6f7dc254399eb3830 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 17:46:19 -0800 Subject: [PATCH 45/64] rcu: Avoid needlessly IPIing CPUs at GP end If a CPU enters dyntick-idle mode with callbacks pending, it will need an IPI at the end of the grace period. However, if it exits dyntick-idle mode before the grace period ends, it will be needlessly IPIed at the end of the grace period. Therefore, this commit clears the per-CPU rcu_awake_at_gp_end flag when a CPU determines that it does not need it. This in turn requires disabling interrupts across much of rcu_prepare_for_idle() in order to avoid having nested interrupts clearing this state out from under us. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 45790bfb6e8c77..c4daf1e19e01fa 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2027,6 +2027,9 @@ int rcu_needs_cpu(int cpu) static void rcu_prepare_for_idle(int cpu) { int c = 0; + unsigned long flags; + + local_irq_save(flags); /* * If there are no callbacks on this CPU or if RCU has no further @@ -2036,14 +2039,17 @@ static void rcu_prepare_for_idle(int cpu) if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_awake_at_gp_end, cpu) = 0; + local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } if (!rcu_pending(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_awake_at_gp_end, cpu) = 1; + local_irq_restore(flags); + trace_rcu_prep_idle("Dyntick with callbacks"); return; /* Nothing to do immediately. */ } @@ -2052,6 +2058,7 @@ static void rcu_prepare_for_idle(int cpu) * refrained from disabling the scheduling-clock tick. */ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + local_irq_restore(flags); trace_rcu_prep_idle("In holdoff"); return; } @@ -2060,9 +2067,11 @@ static void rcu_prepare_for_idle(int cpu) if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + per_cpu(rcu_awake_at_gp_end, cpu) = 0; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + local_irq_restore(flags); trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2095,10 +2104,13 @@ static void rcu_prepare_for_idle(int cpu) * So try forcing the callbacks through the grace period. */ if (c) { + local_irq_restore(flags); trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else + } else { + local_irq_restore(flags); trace_rcu_prep_idle("Callbacks drained"); + } } /* From f535a607c13c7b674e0788ca5765779aa74a01c3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 20:43:02 -0800 Subject: [PATCH 46/64] rcu: Eliminate RCU_FAST_NO_HZ grace-period hang With the new implementation of RCU_FAST_NO_HZ, it was possible to hang RCU grace periods as follows: o CPU 0 attempts to go idle, cycles several times through the rcu_prepare_for_idle() loop, then goes dyntick-idle when RCU needs nothing more from it, while still having at least on RCU callback pending. o CPU 1 goes idle with no callbacks. Both CPUs can then stay in dyntick-idle mode indefinitely, preventing the RCU grace period from ever completing, possibly hanging the system. This commit therefore prevents CPUs that have RCU callbacks from entering dyntick-idle mode. This approach also eliminates the need for the end-of-grace-period IPIs used previously. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 - kernel/rcutree.c | 2 - kernel/rcutree.h | 3 -- kernel/rcutree_plugin.h | 78 +------------------------------------- 4 files changed, 2 insertions(+), 82 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index debe453c96230e..8dd6fcb9494657 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -287,7 +287,6 @@ TRACE_EVENT(rcu_dyntick, * * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. - * "Dyntick with callbacks": Callbacks remain, but RCU doesn't need CPU. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7fb8b0e608116e..13fab4a9f9fbf5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1086,7 +1086,6 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * callbacks are waiting on the grace period that just now * completed. */ - rcu_schedule_wake_gp_end(); if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1672,7 +1671,6 @@ static void rcu_process_callbacks(struct softirq_action *unused) &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - rcu_wake_cpus_for_gp_end(); trace_rcu_utilization("End RCU core"); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea32405177c925..70d8a557090f44 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,7 +88,6 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int wake_gp_end; /* A GP ended, need to wake up CPUs. */ }; /* RCU's kthread states for tracing. */ @@ -469,7 +468,5 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle(int cpu); -static void rcu_wake_cpus_for_gp_end(void); -static void rcu_schedule_wake_gp_end(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c4daf1e19e01fa..3d84dbc113d619 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1964,28 +1964,11 @@ static void rcu_prepare_for_idle(int cpu) { } -/* - * CPUs are never putting themselves to sleep with callbacks pending, - * so there is no need to awaken them. - */ -static void rcu_wake_cpus_for_gp_end(void) -{ -} - -/* - * CPUs are never putting themselves to sleep with callbacks pending, - * so there is no need to schedule the act of awakening them. - */ -static void rcu_schedule_wake_gp_end(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #define RCU_NEEDS_CPU_FLUSHES 5 static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(bool, rcu_awake_at_gp_end); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2032,26 +2015,16 @@ static void rcu_prepare_for_idle(int cpu) local_irq_save(flags); /* - * If there are no callbacks on this CPU or if RCU has no further - * need for this CPU at the moment, enter dyntick-idle mode. - * Also reset state so as to not prejudice later attempts. + * If there are no callbacks on this CPU, enter dyntick-idle mode. + * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_awake_at_gp_end, cpu) = 0; local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } - if (!rcu_pending(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_awake_at_gp_end, cpu) = 1; - local_irq_restore(flags); - trace_rcu_prep_idle("Dyntick with callbacks"); - return; /* Nothing to do immediately. */ - } /* * If in holdoff mode, just return. We will presumably have @@ -2067,7 +2040,6 @@ static void rcu_prepare_for_idle(int cpu) if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; - per_cpu(rcu_awake_at_gp_end, cpu) = 0; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; @@ -2113,50 +2085,4 @@ static void rcu_prepare_for_idle(int cpu) } } -/* - * Wake up a CPU by invoking the RCU core. Intended for use by - * rcu_wake_cpus_for_gp_end(), which passes this function to - * smp_call_function_single(). - */ -static void rcu_wake_cpu(void *unused) -{ - trace_rcu_prep_idle("CPU awakened at GP end"); - invoke_rcu_core(); -} - -/* - * If an RCU grace period ended recently, scan the rcu_awake_at_gp_end - * per-CPU variables, and wake up any CPUs that requested a wakeup. - */ -static void rcu_wake_cpus_for_gp_end(void) -{ - int cpu; - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (!rdtp->wake_gp_end) - return; - rdtp->wake_gp_end = 0; - for_each_online_cpu(cpu) { - if (per_cpu(rcu_awake_at_gp_end, cpu)) { - per_cpu(rcu_awake_at_gp_end, cpu) = 0; - smp_call_function_single(cpu, rcu_wake_cpu, NULL, 0); - } - } -} - -/* - * A grace period has just ended, and so we will need to awaken CPUs - * that now have work to do. But we cannot send IPIs with interrupts - * disabled, so just set a flag so that this will happen upon exit - * from RCU core processing. - */ -static void rcu_schedule_wake_gp_end(void) -{ - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - rdtp->wake_gp_end = 1; -} - -/* @@@ need tracing as well. */ - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ From 3ad0decf98d97b9039d8ed47cee287366b929cdf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Nov 2011 21:08:13 -0800 Subject: [PATCH 47/64] rcu: Reduce latency of rcu_prepare_for_idle() Re-enable interrupts across calls to quiescent-state functions and also across force_quiescent_state() to reduce latency. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3d84dbc113d619..42ca5a400ae393 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2009,7 +2009,6 @@ int rcu_needs_cpu(int cpu) */ static void rcu_prepare_for_idle(int cpu) { - int c = 0; unsigned long flags; local_irq_save(flags); @@ -2055,27 +2054,30 @@ static void rcu_prepare_for_idle(int cpu) */ #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_preempt_qs(cpu); force_quiescent_state(&rcu_preempt_state, 0); - c = c || per_cpu(rcu_preempt_data, cpu).nxtlist; + local_irq_save(flags); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { + local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + local_irq_save(flags); } /* * If RCU callbacks are still pending, RCU still needs this CPU. * So try forcing the callbacks through the grace period. */ - if (c) { + if (rcu_cpu_has_callbacks(cpu)) { local_irq_restore(flags); trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); From c92b131bdcf89bf79870f1631d07547241a98f6c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Nov 2011 13:38:58 -0800 Subject: [PATCH 48/64] rcu: Remove dynticks false positives and RCU failures Assertions in rcu_init_percpu_data() unknowingly relied on outgoing CPUs being turned off before reaching the idle loop. Unfortunately, when running under kvm/qemu on x86, CPUs really can get to idle before begin shut off. These CPUs are then born in dyntick-idle mode from an RCU perspective, which results in splats in rcu_init_percpu_data() and in RCU wrongly ignoring those CPUs despite them being active. This in turn can cause RCU to end grace periods prematurely, potentially freeing up memory that the newly onlined CPUs were still using. This is most decidedly not what we need to see in an RCU implementation. This commit therefore replaces the assertions in rcu_init_percpu_data() with code that forces RCU's dyntick-idle view of newly onlined CPUs to match reality. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 13fab4a9f9fbf5..aab9ed504b17e2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2054,8 +2054,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); - WARN_ON_ONCE((atomic_read(&rdp->dynticks->dynticks) & 0x1) != 1); + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + atomic_set(&rdp->dynticks->dynticks, + (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* From f0e7c19db8798b4b991a2c71911e71f5dfdb348f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Nov 2011 15:02:05 -0800 Subject: [PATCH 49/64] rcu: Identify dyntick-idle CPUs on first force_quiescent_state() pass Fixes and workarounds for a number of issues (for example, that in df4012edc) make it safe to once again detect dyntick-idle CPUs on the first pass of force_quiescent_state(), so this commit makes that change. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aab9ed504b17e2..69bb37287cc894 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -607,7 +607,7 @@ int rcu_is_cpu_rrupt_from_idle(void) static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return 0; + return (rdp->dynticks_snap & 0x1) == 0; } /* From 3842a0832a1d6eb0b31421f8810a813135967512 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 10:42:42 -0800 Subject: [PATCH 50/64] rcu: Document same-context read-side constraints The intent is that a given RCU read-side critical section be confined to a single context. For example, it is illegal to invoke rcu_read_lock() in an exception handler and then invoke rcu_read_unlock() from the context of the task that received the exception. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 +++++++++++++++ include/linux/srcu.h | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5dd6fd8b3203f6..81c04f4348ecac 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -265,6 +265,11 @@ extern int debug_lockdep_rcu_enabled(void); * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. + * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. */ static inline int rcu_read_lock_held(void) { @@ -689,6 +694,11 @@ static inline void rcu_read_unlock(void) * critical sections in interrupt context can use just rcu_read_lock(), * though this should at least be commented to avoid confusing people * reading the code. + * + * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() + * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { @@ -716,6 +726,11 @@ static inline void rcu_read_unlock_bh(void) * are being done using call_rcu_sched() or synchronize_rcu_sched(). * Read-side critical sections can also be introduced by anything that * disables preemption, including local_irq_disable() and friends. + * + * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() + * must occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock_sched() from process context if the matching + * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 1eb520cd1680db..e1b005918bbb31 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -158,6 +158,11 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). + * + * Note that srcu_read_lock() and the matching srcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() + * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { From 7cb92499000e3c86dae653077b1465458a039ef6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Nov 2011 12:28:34 -0800 Subject: [PATCH 51/64] rcu: Permit dyntick-idle with callbacks pending The current implementation of RCU_FAST_NO_HZ prevents CPUs from entering dyntick-idle state if they have RCU callbacks pending. Unfortunately, this has the side-effect of often preventing them from entering this state, especially if at least one other CPU is not in dyntick-idle state. However, the resulting per-tick wakeup is wasteful in many cases: if the CPU has already fully responded to the current RCU grace period, there will be nothing for it to do until this grace period ends, which will frequently take several jiffies. This commit therefore permits a CPU that has done everything that the current grace period has asked of it (rcu_pending() == 0) even if it still as RCU callbacks pending. However, such a CPU posts a timer to wake it up several jiffies later (6 jiffies, based on experience with grace-period lengths). This wakeup is required to handle situations that can result in all CPUs being in dyntick-idle mode, thus failing to ever complete the current grace period. If a CPU wakes up before the timer goes off, then it cancels that timer, thus avoiding spurious wakeups. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 3 +- kernel/rcutree.c | 3 ++ kernel/rcutree.h | 2 + kernel/rcutree_plugin.h | 75 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 78 insertions(+), 5 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 8dd6fcb9494657..c75418c3ccb810 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -288,9 +288,10 @@ TRACE_EVENT(rcu_dyntick, * "No callbacks": Nothing to do, no callbacks on this CPU. * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. + * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! - * "CPU awakened at GP end": + * "Timer": Timer fired to cause CPU to continue processing callbacks. */ TRACE_EVENT(rcu_prep_idle, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 69bb37287cc894..bf085d7f6a3f37 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -448,6 +448,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -2057,6 +2058,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* @@ -2138,6 +2140,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); rcu_preempt_send_cbs_to_online(); + rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 70d8a557090f44..9bcfbc9d16c645 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -467,6 +467,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_cpu_kthread_setrt(int cpu, int to_rt); static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_for_idle_init(int cpu); +static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 42ca5a400ae393..dbcea6b93aea62 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1947,15 +1947,29 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we have preemptible RCU, just check whether this CPU needs - * any flavor of RCU. Do not chew up lots of CPU cycles with preemption - * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs + * any flavor of RCU. */ int rcu_needs_cpu(int cpu) { return rcu_cpu_has_callbacks(cpu); } +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up + * after it. + */ +static void rcu_cleanup_after_idle(int cpu) +{ +} + /* * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, * is nothing. @@ -1966,9 +1980,12 @@ static void rcu_prepare_for_idle(int cpu) #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#define RCU_NEEDS_CPU_FLUSHES 5 +#define RCU_NEEDS_CPU_FLUSHES 5 /* Allow for callback self-repost. */ +#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); +static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); +static ktime_t rcu_idle_gp_wait; /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1988,6 +2005,47 @@ int rcu_needs_cpu(int cpu) return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; } +/* + * Timer handler used to force CPU to start pushing its remaining RCU + * callbacks in the case where it entered dyntick-idle mode with callbacks + * pending. The hander doesn't really need to do anything because the + * real work is done upon re-entry to idle, or by the next scheduling-clock + * interrupt should idle not be re-entered. + */ +static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +{ + trace_rcu_prep_idle("Timer"); + return HRTIMER_NORESTART; +} + +/* + * Initialize the timer used to pull CPUs out of dyntick-idle mode. + */ +static void rcu_prepare_for_idle_init(int cpu) +{ + static int firsttime = 1; + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtp->function = rcu_idle_gp_timer_func; + if (firsttime) { + unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); + + rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + firsttime = 0; + } +} + +/* + * Clean up for exit from idle. Because we are exiting from idle, there + * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * do nothing if this timer is not active, so just cancel it unconditionally. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); +} + /* * Check to see if any RCU-related work can be done by the current CPU, * and if so, schedule a softirq to get it done. This function is part @@ -2040,6 +2098,15 @@ static void rcu_prepare_for_idle(int cpu) /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Can we go dyntick-idle despite still having callbacks? */ + if (!rcu_pending(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + return; /* Nothing more to do immediately. */ + } + /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; local_irq_restore(flags); From b6fc6020140db437069d5bec447858fcfd64d31c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Nov 2011 16:18:56 -0800 Subject: [PATCH 52/64] rcu: Don't check irq nesting from rcu idle entry/exit Because tasks do not nest, rcu_idle_enter() and rcu_idle_exit() do not need to check for nesting. This commit therefore moves nesting checks from rcu_idle_enter_common() to rcu_irq_exit() and from rcu_idle_exit_common() to rcu_irq_enter(). Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bf085d7f6a3f37..860c02c7c95919 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -350,10 +350,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) */ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { - if (rdtp->dynticks_nesting) { - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); - return; - } trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -426,7 +422,10 @@ void rcu_irq_exit(void) oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - rcu_idle_enter_common(rdtp, oldval); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -439,10 +438,6 @@ void rcu_irq_exit(void) */ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) { - if (oldval) { - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); - return; - } smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ @@ -518,7 +513,10 @@ void rcu_irq_enter(void) oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); - rcu_idle_exit_common(rdtp, oldval); + if (oldval) + trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + else + rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } From facc4e159672b4ed10aa18147bfa187b013c9505 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 28 Nov 2011 16:26:56 -0800 Subject: [PATCH 53/64] rcu: Irq nesting is always 0 on rcu_enter_idle_common Because tasks don't nest, the ->dyntick_nesting must always be zero upon entry to rcu_idle_enter_common(). Therefore, pass "0" rather than the counter itself. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 860c02c7c95919..c0ed3765ec3909 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -350,12 +350,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) */ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) { - trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick("Start", oldval, 0); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on entry: not idle task", - oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, From dff1672d9199fffddb58fa7970ccf59005fc35f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Nov 2011 15:57:13 -0800 Subject: [PATCH 54/64] rcu: Keep invoking callbacks if CPU otherwise idle The rcu_do_batch() function that invokes callbacks for TREE_RCU and TREE_PREEMPT_RCU normally throttles callback invocation to avoid degrading scheduling latency. However, as long as the CPU would otherwise be idle, there is no downside to continuing to invoke any callbacks that have passed through their grace periods. In fact, processing such callbacks in a timely manner has the benefit of increasing the probability that the CPU can enter the power-saving dyntick-idle mode. Therefore, this commit allows callback invocation to continue beyond the preset limit as long as the scheduler does not have some other task to run and as long as context is that of the idle task or the relevant RCU kthread. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 5 ++++- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c0ed3765ec3909..4ec4b14cfba662 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1403,7 +1403,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) debug_rcu_head_unqueue(list); __rcu_reclaim(rsp->name, list); list = next; - if (++count >= bl) + /* Stop only if limit reached and CPU has something to do. */ + if (++count >= bl && + (need_resched() || + (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9bcfbc9d16c645..fddff92d667608 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -455,6 +455,7 @@ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index dbcea6b93aea62..adb6e666c6f456 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1336,6 +1336,15 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_restore(flags); } +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return __get_cpu_var(rcu_cpu_kthread_task) == current; +} + /* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost @@ -1780,6 +1789,11 @@ static void invoke_rcu_callbacks_kthread(void) WARN_ON_ONCE(1); } +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } From f23f7fa1c8effca19b52b98fc71016109d21db59 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2011 15:41:14 -0800 Subject: [PATCH 55/64] rcu: Adaptive dyntick-idle preparation If there are other CPUs active at a given point in time, then there is a limit to what a given CPU can do to advance the current RCU grace period. Beyond this limit, attempting to force the RCU grace period forward will do nothing but consume energy burning CPU cycles. Therefore, this commit takes an adaptive approach to RCU_FAST_NO_HZ preparations for idle. It pushes the RCU core state machine for two cycles unconditionally, and then it will push from zero to three additional cycles, but only as long as the RCU core has work for this CPU to do immediately. The rcu_pending() function is used to check whether the RCU core has such work. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 54 ++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index adb6e666c6f456..8cd9efe7e81f4f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1994,8 +1994,40 @@ static void rcu_prepare_for_idle(int cpu) #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#define RCU_NEEDS_CPU_FLUSHES 5 /* Allow for callback self-repost. */ +/* + * This code is invoked when a CPU goes idle, at which point we want + * to have the CPU do everything required for RCU so that it can enter + * the energy-efficient dyntick-idle mode. This is handled by a + * state machine implemented by rcu_prepare_for_idle() below. + * + * The following three proprocessor symbols control this state machine: + * + * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt + * to satisfy RCU. Beyond this point, it is better to incur a periodic + * scheduling-clock interrupt than to loop through the state machine + * at full power. + * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are + * optional if RCU does not need anything immediately from this + * CPU, even if this CPU still has RCU callbacks queued. The first + * times through the state machine are mandatory: we need to give + * the state machine a chance to communicate a quiescent state + * to the RCU core. + * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted + * to sleep in dyntick-idle mode with RCU callbacks pending. This + * is sized to be roughly one RCU grace period. Those energy-efficiency + * benchmarkers who might otherwise be tempted to set this to a large + * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your + * system. And if you are -that- concerned about energy efficiency, + * just power the system down and be done with it! + * + * The values below work well in practice. If future workloads require + * adjustment, they can be converted into kernel config parameters, though + * making the state machine smarter might be a better option. + */ +#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ +#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ + static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); @@ -2110,17 +2142,17 @@ static void rcu_prepare_for_idle(int cpu) /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; + } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + !rcu_pending(cpu)) { /* Can we go dyntick-idle despite still having callbacks? */ - if (!rcu_pending(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); - return; /* Nothing more to do immediately. */ - } - + trace_rcu_prep_idle("Dyntick with callbacks"); + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); + return; /* Nothing more to do immediately. */ + } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; local_irq_restore(flags); From 2d1dc9a600edf33321bcdc1c808b7957d8a3f3e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Nov 2011 17:29:18 -0800 Subject: [PATCH 56/64] rcu: Remove redundant rcu_cpu_stall_suppress declaration No point in having two identical rcu_cpu_stall_suppress declarations, so remove the more obscure of the two. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4ec4b14cfba662..2b2e1a996a65ce 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -642,8 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #ifdef CONFIG_SMP */ -int rcu_cpu_stall_suppress __read_mostly; - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; From 2987557f52b97f679f0c324d8f51b8d66e1f2084 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 3 Dec 2011 13:06:50 -0800 Subject: [PATCH 57/64] driver-core/cpu: Expose hotpluggability to the rest of the kernel When architectures register CPUs, they indicate whether the CPU allows hotplugging; notably, x86 and ARM don't allow hotplugging CPU 0. Userspace can easily query the hotpluggability of a CPU via sysfs; however, the kernel has no convenient way of accessing that property in an architecture-independent way. While the kernel can simply try it and see, some code needs to distinguish between "hotplug failed" and "hotplug has no hope of working on this CPU"; for example, rcutorture's CPU hotplug tests want to avoid drowning out real hotplug failures with expected failures. Expose this property via a new cpu_is_hotpluggable function, so that the rest of the kernel can access it in an architecture-independent way. Signed-off-by: Josh Triplett Signed-off-by: Paul E. McKenney --- drivers/base/cpu.c | 7 +++++++ include/linux/cpu.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 251acea3d35986..3991502b21e551 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -247,6 +247,13 @@ struct sys_device *get_cpu_sysdev(unsigned cpu) } EXPORT_SYMBOL_GPL(get_cpu_sysdev); +bool cpu_is_hotpluggable(unsigned cpu) +{ + struct sys_device *dev = get_cpu_sysdev(cpu); + return dev && container_of(dev, struct cpu, sysdev)->hotpluggable; +} +EXPORT_SYMBOL_GPL(cpu_is_hotpluggable); + int __init cpu_dev_init(void) { int err; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6cb60fd2ea8454..305c263021e7e6 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -27,6 +27,7 @@ struct cpu { extern int register_cpu(struct cpu *cpu, int num); extern struct sys_device *get_cpu_sysdev(unsigned cpu); +extern bool cpu_is_hotpluggable(unsigned cpu); extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); From f220242af98a5248209426f36d93226c3e0f2391 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 3 Dec 2011 15:09:28 -0800 Subject: [PATCH 58/64] rcu: Make rcutorture test for hotpluggability before offlining CPUs The rcutorture test now can automatically exercise CPU hotplug and collect success statistics, which can be correlated with other rcutorture activity. This permits rcutorture to completely exercise RCU regardless of what sort of userspace and filesystem layout is in use. Unfortunately, rcutorture is happy to attempt to offline CPUs that cannot be offlined, for example, CPU 0 in both the x86 and ARM architectures. Although this allows rcutorture testing to proceed normally, it confounds attempts at error analysis due to the resulting flood of spurious CPU-hotplug errors. Therefore, this commit uses the new cpu_is_hotpluggable() function to avoid attempting to offline CPUs that are not hotpluggable, which in turn avoids spurious CPU-hotplug errors. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1e422ae1506b3a..186ead9fa34883 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1388,7 +1388,7 @@ rcu_torture_onoff(void *arg) WARN_ON(maxcpu < 0); while (!kthread_should_stop()) { cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu)) { + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "rcu_torture_onoff task: offlining %d\n", @@ -1402,7 +1402,7 @@ rcu_torture_onoff(void *arg) torture_type, cpu); n_offline_successes++; } - } else { + } else if (cpu_is_hotpluggable(cpu)) { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "rcu_torture_onoff task: onlining %d\n", From 101db7b41d8d6c070278bca1f7bce814ecbf781d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Dec 2011 15:36:31 -0800 Subject: [PATCH 59/64] rcu: Add rcutorture tests for srcu_read_lock_raw() This commit adds simple rcutorture tests for srcu_read_lock_raw() and srcu_read_unlock_raw(). It does not test doing srcu_read_lock_raw() in an exception handler and releasing it in the corresponding process context. Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 186ead9fa34883..88f17b8a3b1dac 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -632,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock_raw(&srcu_ctl); +} + +static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock_raw(&srcu_ctl, idx); +} + +static struct rcu_torture_ops srcu_raw_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock_raw, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock_raw, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_raw" +}; + static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -1591,7 +1615,7 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); From 4968c300e1fa5389fdf1f1ebd8b8e4aec9aa4a9e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Dec 2011 16:32:40 -0800 Subject: [PATCH 60/64] rcu: Augment rcu_batch_end tracing for idle and callback state The current rcu_batch_end event trace records only the name of the RCU flavor and the total number of callbacks that remain queued on the current CPU. This is insufficient for testing and tuning the new dyntick-idle RCU_FAST_NO_HZ code, so this commit adds idle state along with whether or not any of the callbacks that were ready to invoke at the beginning of rcu_do_batch() are still queued. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 38 +++++++++++++++++++++++++++++--------- kernel/rcutiny.c | 10 ++++++++-- kernel/rcutiny_plugin.h | 25 +++++++++++++++++++++++++ kernel/rcutree.c | 8 ++++++-- 4 files changed, 68 insertions(+), 13 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c75418c3ccb810..d2d88bed891b14 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -461,27 +461,46 @@ TRACE_EVENT(rcu_invoke_kfree_callback, /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been - * invoked. The first argument is the name of the RCU flavor and - * the second argument is number of callbacks actually invoked. + * invoked. The first argument is the name of the RCU flavor, + * the second argument is number of callbacks actually invoked, + * the third argument (cb) is whether or not any of the callbacks that + * were ready to invoke at the beginning of this batch are still + * queued, the fourth argument (nr) is the return value of need_resched(), + * the fifth argument (iit) is 1 if the current task is the idle task, + * and the sixth argument (risk) is the return value from + * rcu_is_callbacks_kthread(). */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked), + TP_PROTO(char *rcuname, int callbacks_invoked, + bool cb, bool nr, bool iit, bool risk), - TP_ARGS(rcuname, callbacks_invoked), + TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( __field(char *, rcuname) __field(int, callbacks_invoked) + __field(bool, cb) + __field(bool, nr) + __field(bool, iit) + __field(bool, risk) ), TP_fast_assign( __entry->rcuname = rcuname; __entry->callbacks_invoked = callbacks_invoked; - ), - - TP_printk("%s CBs-invoked=%d", - __entry->rcuname, __entry->callbacks_invoked) + __entry->cb = cb; + __entry->nr = nr; + __entry->iit = iit; + __entry->risk = risk; + ), + + TP_printk("%s CBs-invoked=%d idle=%c%c%c%c", + __entry->rcuname, __entry->callbacks_invoked, + __entry->cb ? 'C' : '.', + __entry->nr ? 'S' : '.', + __entry->iit ? 'I' : '.', + __entry->risk ? 'R' : '.') ); /* @@ -524,7 +543,8 @@ TRACE_EVENT(rcu_torture_read, #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) -#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) +#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ + do { } while (0) #define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) #endif /* #else #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e5bd94954fa3f7..977296dca0a411 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -259,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, + ACCESS_ONCE(rcp->rcucblist), + need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); return; } @@ -288,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread())); } static void rcu_process_callbacks(struct softirq_action *unused) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 2b0484a5dc285b..dfa97cbb39106e 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void) wake_up(&rcu_kthread_wq); } +#ifdef CONFIG_RCU_TRACE + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return rcu_kthread_task == current; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + /* * This kthread invokes RCU callbacks whose grace periods have * elapsed. It is awakened as needed, and takes the place of the @@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void) raise_softirq(RCU_SOFTIRQ); } +#ifdef CONFIG_RCU_TRACE + +/* + * There is no callback kthread, so this thread is never it. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + void rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2b2e1a996a65ce..6c4a6722abfd2f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1373,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, 0, 0); - trace_rcu_batch_end(rsp->name, 0); + trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + need_resched(), is_idle_task(current), + rcu_is_callbacks_kthread()); return; } @@ -1409,7 +1411,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count); + trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; From d493011a376f9df3b8b3da1102509b343b1a4ef2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2011 15:11:23 -0800 Subject: [PATCH 61/64] docs: Additional LWN links to RCU API Tyler Hicks pointed me at an additional article on RCU and I figured it should probably be mentioned with the others. Signed-off-by: Kees Cook Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 8e8cdc2430b9ba..6bbe8dcdc3da94 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -4,6 +4,7 @@ to start learning about RCU: 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ +4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ What is RCU? From 70321d447aa1a7cc2d60db16234f43c5a65630e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Dec 2011 14:00:06 -0800 Subject: [PATCH 62/64] Revert "rcu: Permit rt_mutex_unlock() with irqs disabled" This reverts commit 5342e269b2b58ee0b0b4168a94087faaa60d0567. The approach taken in this patch was deemed too abusive to mutexes, and thus too likely to result in maintenance problems in the future. Instead, we will disallow RCU read-side critical sections that partially overlap with interrupt-disbled code segments. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 5 ----- kernel/rtmutex.c | 8 -------- 2 files changed, 13 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8cd9efe7e81f4f..f55f10ba01f548 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1165,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static struct lock_class_key rcu_boost_class; - /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1229,9 +1227,6 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); - /* Avoid lockdep false positives. This rt_mutex is its own thing. */ - lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, - "rcu_boost_mutex"); t->rcu_boost_mutex = &mtx; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index f9d8482dd4872f..a242e691c99303 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter *waiter) { int ret = 0; - int was_disabled; for (;;) { /* Try to acquire the lock: */ @@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, raw_spin_unlock(&lock->wait_lock); - was_disabled = irqs_disabled(); - if (was_disabled) - local_irq_enable(); - debug_rt_mutex_print_deadlock(waiter); schedule_rt_mutex(lock); - if (was_disabled) - local_irq_disable(); - raw_spin_lock(&lock->wait_lock); set_current_state(state); } From 4f89b336fd1edf0c88875d0b7fcdc288c7de903d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Dec 2011 14:43:47 -0800 Subject: [PATCH 63/64] rcu: Apply ACCESS_ONCE() to rcu_boost() return value Both TINY_RCU's and TREE_RCU's implementations of rcu_boost() access the ->boost_tasks and ->exp_tasks fields without preventing concurrent changes to these fields. This commit therefore applies ACCESS_ONCE in order to prevent compiler mischief. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 4 ++-- kernel/rcutree_plugin.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index dfa97cbb39106e..9cb1ae4aabdd8b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -312,8 +312,8 @@ static int rcu_boost(void) rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return rcu_preempt_ctrlblk.boost_tasks != NULL || - rcu_preempt_ctrlblk.exp_tasks != NULL; + return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || + ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; } /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f55f10ba01f548..8bb35d73e1f9d2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1232,7 +1232,8 @@ static int rcu_boost(struct rcu_node *rnp) rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; + return ACCESS_ONCE(rnp->exp_tasks) != NULL || + ACCESS_ONCE(rnp->boost_tasks) != NULL; } /* From a513f6bab0939800dcf1e7c075e733420cf967c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 11 Dec 2011 21:54:45 -0800 Subject: [PATCH 64/64] cpu: Export cpu_up() Building rcutorture as a module requires cpu_up() as well as cpu_down() exported, so apply EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 563f136094704f..9d448ddb224745 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -380,6 +380,7 @@ int __cpuinit cpu_up(unsigned int cpu) cpu_maps_update_done(); return err; } +EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus;