From 62da1921292ef789c23a7bf01d671d7572baf377 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Sep 2012 16:02:49 -0700 Subject: [PATCH 01/49] rcu: Accelerate callbacks for CPU initiating a grace period Because grace-period initialization is carried out by a separate kthread, it might happen on a different CPU than the one that had the callback needing a grace period -- which is where the callback acceleration needs to happen. Fortunately, rcu_start_gp() holds the root rcu_node structure's ->lock, which prevents a new grace period from starting. This allows this function to safely determine that a grace period has not yet started, which in turn allows it to fully accelerate any callbacks that it has pending. This commit adds this acceleration. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204ae..93d6871bf7f960 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1404,15 +1404,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period - * task or this CPU does not need another grace period. + * task, this CPU does not need another grace period, + * or a grace period is already in progress. * Either way, don't start a new grace period. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + /* + * Because there is no grace period in progress right now, + * any callbacks we have up to this point will be satisfied + * by the next grace period. So promote all callbacks to be + * handled after the end of the next grace period. If the + * CPU is not yet aware of the end of the previous grace period, + * we need to allow for the callback advancement that will + * occur when it does become aware. Deadlock prevents us from + * making it aware at this point: We cannot acquire a leaf + * rcu_node ->lock while holding the root rcu_node ->lock. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + if (rdp->completed == rsp->completed) + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + + /* Ensure that CPU is aware of completion of last grace period. */ + rcu_process_gp_end(rsp, rdp); + local_irq_restore(flags); + + /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } From f191eec58803e1e16c3421638cdcc9195c425851 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Oct 2012 10:28:30 -0700 Subject: [PATCH 02/49] Documentation: Fix memory-barriers.txt example This commit fixes a broken example of overlapping stores in the Documentation/memory-barriers.txt file. Reported-by: Nikunj A Dadhania Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 2759f7c188f0cd..3c4e1b3b80a1f8 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -251,12 +251,13 @@ And there are a number of things that _must_ or _must_not_ be assumed: And for: - *A = X; Y = *A; + *A = X; *(A + 4) = Y; - we may get either of: + we may get any of: - STORE *A = X; Y = LOAD *A; - STORE *A = Y = X; + STORE *A = X; STORE *(A + 4) = Y; + STORE *(A + 4) = Y; STORE *A = X; + STORE {*A, *(A + 4) } = {X, Y}; ========================= From 0f9574d83234274c4d8f8f742b45022e5718b16e Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Wed, 17 Oct 2012 14:46:13 -0700 Subject: [PATCH 03/49] rcu: Correct the name of a reference in list of RCU papers Trying to go through the history of RCU (not for the weak minded) led me to search for a non-existent paper. Correct it to the actual reference Signed-off-by: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Paul E. McKenney --- Documentation/RCU/RTFP.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 7c1dfb19fc40a9..7f40c72a9c5150 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -186,7 +186,7 @@ Bibtex Entries @article{Kung80 ,author="H. T. Kung and Q. Lehman" -,title="Concurrent Maintenance of Binary Search Trees" +,title="Concurrent Manipulation of Binary Search Trees" ,Year="1980" ,Month="September" ,journal="ACM Transactions on Database Systems" From abfd6e58aed4f89fd69b9b17bc4b4527efe3a645 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Sep 2012 16:59:47 -0700 Subject: [PATCH 04/49] rcu: Fix comment about _rcu_barrier()/orphanage exclusion In the old days, _rcu_barrier() acquired ->onofflock to exclude rcu_send_cbs_to_orphanage(), which allowed the latter to avoid memory barriers in callback handling. However, _rcu_barrier() recently started doing get_online_cpus() to lock out CPU-hotplug operations entirely, which means that the comment in rcu_send_cbs_to_orphanage() that talks about ->onofflock is now obsolete. This commit therefore fixes the comment. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204ae..ac8aed8ee4175a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1581,8 +1581,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, { /* * Orphan the callbacks. First adjust the counts. This is safe - * because ->onofflock excludes _rcu_barrier()'s adoption of - * the callbacks, thus no memory barrier is required. + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. */ if (rdp->nxtlist != NULL) { rsp->qlen_lazy += rdp->qlen_lazy; From 489832609a1ad7189d11715d8cefb457d90182c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Sep 2012 16:08:31 -0700 Subject: [PATCH 05/49] rcu: Make rcutorture give diagnostics if CPU offline fails This commit causes rcutorture to print the errno if cpu_down() fails when the rcutorture "verbose" module parameter is specified. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a86..9900f560f1bdaf 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1502,6 +1502,7 @@ rcu_torture_onoff(void *arg) unsigned long delta; int maxcpu = -1; DEFINE_RCU_RANDOM(rand); + int ret; unsigned long starttime; VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); @@ -1522,7 +1523,13 @@ rcu_torture_onoff(void *arg) torture_type, cpu); starttime = jiffies; n_offline_attempts++; - if (cpu_down(cpu) == 0) { + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) pr_alert("%s" TORTURE_FLAG "rcu_torture_onoff task: offlined %d\n", From 4d9a5d4319e22670ec6d6227e12b54f361c46d0f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Oct 2012 01:47:16 +0200 Subject: [PATCH 06/49] rcu: Remove rcu_switch() It's only there to call rcu_user_hooks_switch(). Let's just call rcu_user_hooks_switch() directly, we don't need this function in the middle. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Peter Zijlstra Cc: Richard Weinberger Signed-off-by: Paul E. McKenney --- arch/um/drivers/mconsole_kern.c | 2 +- include/linux/rcupdate.h | 2 ++ include/linux/sched.h | 8 -------- kernel/sched/core.c | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 79ccfe6c707870..49e3b49e552f7f 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -648,7 +648,7 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; - rcu_switch(from, to); + rcu_user_hooks_switch(from, to); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7c968e4f929ea4..5d009def7c0cbb 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -204,6 +204,8 @@ static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } static inline void rcu_user_enter_after_irq(void) { } static inline void rcu_user_exit_after_irq(void) { } +static inline void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next) { } #endif /* CONFIG_RCU_USER_QS */ extern void exit_rcu(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e85..432cc5e1bbeede 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1844,14 +1844,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif -static inline void rcu_switch(struct task_struct *prev, - struct task_struct *next) -{ -#ifdef CONFIG_RCU_USER_QS - rcu_user_hooks_switch(prev, next); -#endif -} - static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712f5..68414fa4cd2a63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1887,7 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch(prev, next); + rcu_user_hooks_switch(prev, next); switch_to(prev, next, prev); barrier(); From ba49df4767d4fa5bbd2af3a51709fb81f94264ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 7 Oct 2012 09:26:13 -0700 Subject: [PATCH 07/49] rcu: Update RCU_FAST_NO_HZ help text The RCU_FAST_NO_HZ help text included a warning about overhead on large systems, but that issue has since been resolved. The main remaining issue with RCU_FAST_NO_HZ is increased real-time latency. This commit therefore updates the help text accordingly. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- init/Kconfig | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 6fdd6e339326a0..b63e7982c1c67e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -582,14 +582,13 @@ config RCU_FAST_NO_HZ depends on NO_HZ && SMP default n help - This option causes RCU to attempt to accelerate grace periods - in order to allow CPUs to enter dynticks-idle state more - quickly. On the other hand, this option increases the overhead - of the dynticks-idle checking, particularly on systems with - large numbers of CPUs. - - Say Y if energy efficiency is critically important, particularly - if you have relatively few CPUs. + This option causes RCU to attempt to accelerate grace periods in + order to allow CPUs to enter dynticks-idle state more quickly. + On the other hand, this option increases the overhead of the + dynticks-idle checking, thus degrading scheduling latency. + + Say Y if energy efficiency is critically important, and you don't + care about real-time response. Say N if you are unsure. From 3705b88db0d7cc4a097c32d9e554054103d3f807 Mon Sep 17 00:00:00 2001 From: Antti P Miettinen Date: Fri, 5 Oct 2012 09:59:15 +0300 Subject: [PATCH 08/49] rcu: Add a module parameter to force use of expedited RCU primitives There have been some embedded applications that would benefit from use of expedited grace-period primitives. In some ways, this is similar to synchronize_net() doing either a normal or an expedited grace period depending on lock state, but with control outside of the kernel. This commit therefore adds rcu_expedited boot and sysfs parameters that cause the kernel to substitute expedited primitives for the normal grace-period primitives. [ paulmck: Add trace/event/rcu.h to kernel/srcu.c to avoid build error. Get rid of infinite loop through contention path.] Signed-off-by: Antti P Miettinen Signed-off-by: Paul E. McKenney --- kernel/ksysfs.c | 18 ++++++++++++++++++ kernel/rcu.h | 2 ++ kernel/rcupdate.c | 3 +++ kernel/rcutiny_plugin.h | 5 ++++- kernel/rcutree.c | 12 +++++++++--- kernel/rcutree_plugin.h | 7 +++++-- kernel/srcu.c | 8 +++++++- 7 files changed, 48 insertions(+), 7 deletions(-) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf584d..8715a798aa7ce1 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -141,6 +141,23 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +int rcu_expedited; +static ssize_t rcu_expedited_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", rcu_expedited); +} +static ssize_t rcu_expedited_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_expedited)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_expedited); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -182,6 +199,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif + &rcu_expedited_attr.attr, NULL }; diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc651556..20dfba576c2b7e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) } } +extern int rcu_expedited; + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 29ca1c6da59496..a2cf76177b443d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,12 +46,15 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include #include "rcu.h" +module_param(rcu_expedited, int, 0); + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3d019028220430..f85016a2309b94 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -706,7 +706,10 @@ void synchronize_rcu(void) return; /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - rcu_barrier(); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + rcu_barrier(); } EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204ae..f9c17c39953875 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2224,7 +2224,10 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_sched); + if (rcu_expedited) + synchronize_sched_expedited(); + else + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -2245,7 +2248,10 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_bh); + if (rcu_expedited) + synchronize_rcu_bh_expedited(); + else + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -2328,7 +2334,7 @@ void synchronize_sched_expedited(void) if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - synchronize_sched(); + wait_rcu_gp(call_rcu_sched); return; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f921154881870b..c177ba0cce9a3f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -679,7 +679,10 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - wait_rcu_gp(call_rcu); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -831,7 +834,7 @@ void synchronize_rcu_expedited(void) udelay(trycount * num_online_cpus()); } else { put_online_cpus(); - synchronize_rcu(); + wait_rcu_gp(call_rcu); return; } } diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd8444c..de9074047c92be 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -34,6 +34,10 @@ #include #include +#include + +#include "rcu.h" + /* * Initialize an rcu_batch structure to empty. */ @@ -464,7 +468,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); + __synchronize_srcu(sp, rcu_expedited + ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); From 340f588bbaed6cb518aa65e7a330dcc3fff911f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2012 08:24:54 -0700 Subject: [PATCH 09/49] rcu: Fix precedence error in cpu_needs_another_gp() The fix introduced by a10d206e (rcu: Fix day-one dyntick-idle stall-warning bug) has a C-language precedence error. It turns out that this error is harmless in that the same result is computed for all inputs, but the code is nevertheless a potential source of confusion. This commit therefore introduces parentheses in order to force the execution of the code to reflect the intent. Reported-by: Ben Hutchings Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f9c17c39953875..effd47a54b361f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -313,7 +313,7 @@ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { return *rdp->nxttail[RCU_DONE_TAIL + - ACCESS_ONCE(rsp->completed) != rdp->completed] && + (ACCESS_ONCE(rsp->completed) != rdp->completed)] && !rcu_gp_in_progress(rsp); } From 4e87b2d7e887df3fe251dd7f702591a3acf369ca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:14 +0800 Subject: [PATCH 10/49] srcu: Credit Lai Jiangshan with SRCU rewrite Lai Jiangshan rewrote SRCU, so this commit ensures that he gets his proper share of blame^Wcredit. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 ++ kernel/srcu.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 55a5c52cbb25a7..a55ddb19053c83 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -16,8 +16,10 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney + * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd8444c..0b99f27fa2f799 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -16,8 +16,10 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney + * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt From f2ebfbc991044fd5b89d4529741d7500feb37fbd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:15 +0800 Subject: [PATCH 11/49] srcu: Export process_srcu() Because process_srcu() will be used in DEFINE_SRCU(), which is a macro that could be expanded pretty much anywhere, it can no longer be static. Note that process_srcu() is still internal to srcu.h. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 ++ kernel/srcu.c | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a55ddb19053c83..5cce128f196c02 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -78,6 +78,8 @@ int init_srcu_struct(struct srcu_struct *sp); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +void process_srcu(struct work_struct *work); + /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback diff --git a/kernel/srcu.c b/kernel/srcu.c index 0b99f27fa2f799..b363b0924561c6 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -94,9 +94,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) } } -/* single-thread state-machine */ -static void process_srcu(struct work_struct *work); - static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; @@ -639,7 +636,7 @@ static void srcu_reschedule(struct srcu_struct *sp) /* * This is the work-queue function that handles SRCU grace periods. */ -static void process_srcu(struct work_struct *work) +void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -650,3 +647,4 @@ static void process_srcu(struct work_struct *work) srcu_invoke_callbacks(sp); srcu_reschedule(sp); } +EXPORT_SYMBOL_GPL(process_srcu); From b637a328bd4f43a0e146d1eef0142b650ba0d644 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Sep 2012 16:58:38 -0700 Subject: [PATCH 12/49] rcu: Print remote CPU's stacks in stall warnings The RCU CPU stall warnings rely on trigger_all_cpu_backtrace() to do NMI-based dump of the stack traces of all CPUs. Unfortunately, a number of architectures do not implement trigger_all_cpu_backtrace(), in which case RCU falls back to just dumping the stack of the running CPU. This is unhelpful in the case where the running CPU has detected that some other CPU has stalled. This commit therefore makes the running CPU dump the stacks of the tasks running on the stalled CPUs. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 2 ++ kernel/rcutree.c | 25 ++++++++++++++++++++++++- kernel/sched/core.c | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e85..ba69b5adea30f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -109,6 +109,8 @@ extern void update_cpu_load_nohz(void); extern unsigned long get_parent_ip(unsigned long addr); +extern void dump_cpu_task(int cpu); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204ae..e78538712df024 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -873,6 +873,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } +/* + * Dump stacks of all tasks running on stalled CPUs. This is a fallback + * for architectures that do not implement trigger_all_cpu_backtrace(). + * The NMI-triggered stack traces are more accurate because they are + * printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + dump_cpu_task(rnp->grplo + cpu); + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -929,7 +952,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712f5..59d08fb1a9e3f7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8076,3 +8076,9 @@ struct cgroup_subsys cpuacct_subsys = { .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} From eee058826100e5a344f11601e0c47baeaf07c77b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2012 14:15:05 -0700 Subject: [PATCH 13/49] rcu: Add grace-period information to RCU CPU stall warnings This commit causes the last grace period started and completed to be printed on RCU CPU stall warning messages in order to aid diagnosis. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e78538712df024..8f4de3a7c1f717 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -947,8 +947,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); - printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start)); + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start), + rsp->gpnum, rsp->completed); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -975,7 +976,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); - printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); + pr_cont(" (t=%lu jiffies g=%lu c=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed); if (!trigger_all_cpu_backtrace()) dump_stack(); From 53bb857c373d6b7936f8a7b4451f0a99703c308e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2012 16:35:25 -0700 Subject: [PATCH 14/49] rcu: Dump number of callbacks in stall warning messages In theory, if a grace period manages to get started despite there being no callbacks on any of the CPUs, all CPUs could go into dyntick-idle mode, so that the grace period would never end. This commit updates the RCU CPU stall warning messages to detect this condition by summing up the number of callbacks on all CPUs. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8f4de3a7c1f717..24b21cba2cc80f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -903,6 +903,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) unsigned long flags; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* Only let one CPU complain about others per time interval. */ @@ -947,9 +948,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu)\n", + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed); + rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -964,8 +967,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) static void print_cpu_stall(struct rcu_state *rsp) { + int cpu; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* * OK, time to rat on ourselves... @@ -976,8 +981,10 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); - pr_cont(" (t=%lu jiffies g=%lu c=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); From af71befa282ddf51c09509978abe1e83de8fe7eb Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 24 Oct 2012 11:07:09 -0700 Subject: [PATCH 15/49] rcu: Wordsmith help text for RCU_USER_QS kernel parameter This commit adds a "try" missing from the end of the first paragraph of the RCU_USER_QS help text. [ paulmck: Also fix up the last paragraph a bit. ] Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- init/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index b63e7982c1c67e..ec62139207d3cf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -494,11 +494,11 @@ config RCU_USER_QS puts RCU in extended quiescent state when the CPU runs in userspace. It means that when a CPU runs in userspace, it is excluded from the global RCU state machine and thus doesn't - to keep the timer tick on for RCU. + try to keep the timer tick on for RCU. Unless you want to hack and help the development of the full - tickless feature, you shouldn't enable this option. It adds - unnecessary overhead. + tickless feature, you shouldn't enable this option. It also + adds unnecessary overhead. If unsure say N From 55c6659afaa6fd79a3b5a7c2b42bb87e0c11209d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:16 +0800 Subject: [PATCH 16/49] srcu: Add DEFINE_SRCU() In old days, we had two different API sets for dynamic-allocated per-CPU data and DEFINE_PER_CPU()-defined per_cpu data, and because SRCU used dynamic-allocated per-CPU data, its srcu_struct structures cannot be declared statically. This commit therefore introduces DEFINE_SRCU() and DEFINE_STATIC_SRCU() to allow statically declared SRCU structures, using the new static per-CPU interfaces. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney [ paulmck: Updated for __DELAYED_WORK_INITIALIZER() added argument, fixed whitespace issue. ] --- include/linux/srcu.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 5cce128f196c02..6eb691b083581f 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -42,6 +42,8 @@ struct rcu_batch { struct rcu_head *head, **tail; }; +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + struct srcu_struct { unsigned completed; struct srcu_struct_array __percpu *per_cpu_ref; @@ -72,14 +74,42 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, __init_srcu_struct((sp), #sp, &__srcu_key); \ }) +#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *sp); +#define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ void process_srcu(struct work_struct *work); +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * define and init a srcu struct at build time. + * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. + */ +#define DEFINE_SRCU(name) \ + static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ + struct srcu_struct name = __SRCU_STRUCT_INIT(name); + +#define DEFINE_STATIC_SRCU(name) \ + static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + /** * call_srcu() - Queue a callback for invocation after an SRCU grace period * @sp: srcu_struct in queue the callback From cda4dc813071e6cb04944c5a140610bd06acd295 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:17 +0800 Subject: [PATCH 17/49] rcutorture: Use DEFINE_STATIC_SRCU() Use DEFINE_STATIC_SRCU() to simplify the rcutorture.c SRCU test code. Signed-off-by: Lai Jiangshan Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 41 ++++++----------------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a86..f4019720cecab6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -339,7 +339,6 @@ rcu_stutter_wait(char *title) struct rcu_torture_ops { void (*init)(void); - void (*cleanup)(void); int (*readlock)(void); void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); @@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void) static struct rcu_torture_ops rcu_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { static struct rcu_torture_ops rcu_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, @@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { static struct rcu_torture_ops rcu_bh_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { static struct rcu_torture_ops rcu_bh_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { * Definitions for srcu torture testing. */ -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} +DEFINE_STATIC_SRCU(srcu_ctl); static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) { @@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { }; static struct rcu_torture_ops srcu_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) } static struct rcu_torture_ops srcu_raw_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { }; static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_expedited_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -1936,8 +1909,6 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup) - cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || From 57d34a6cee1399bfedaa73add1915951cbe75cab Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Oct 2012 09:48:30 -0700 Subject: [PATCH 18/49] rcu: Update docs to include kfree_rcu() Mention kfree_rcu() in the call_rcu() section. Additionally fix the example code for list replacement that used the wrong structure element. Signed-off-by: Kees Cook Signed-off-by: Paul E. McKenney --- Documentation/RCU/listRCU.txt | 2 +- Documentation/RCU/whatisRCU.txt | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt index 4349c1487e919c..adb5a37828467c 100644 --- a/Documentation/RCU/listRCU.txt +++ b/Documentation/RCU/listRCU.txt @@ -205,7 +205,7 @@ RCU ("read-copy update") its name. The RCU code is as follows: audit_copy_rule(&ne->rule, &e->rule); ne->rule.action = newaction; ne->rule.file_count = newfield_count; - list_replace_rcu(e, ne); + list_replace_rcu(&e->list, &ne->list); call_rcu(&e->rcu, audit_free_rule); return 0; } diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index bf0f6de2aa00c8..160ac5557e97f1 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -499,6 +499,8 @@ The foo_reclaim() function might appear as follows: { struct foo *fp = container_of(rp, struct foo, rcu); + foo_cleanup(fp->a); + kfree(fp); } @@ -521,6 +523,12 @@ o Use call_rcu() -after- removing a data element from an read-side critical sections that might be referencing that data item. +If the callback for call_rcu() is not doing anything more than calling +kfree() on the structure, you can use kfree_rcu() instead of call_rcu() +to avoid having to write your own callback: + + kfree_rcu(old_fp, rcu); + Again, see checklist.txt for additional rules governing the use of RCU. @@ -773,8 +781,8 @@ a single atomic update, converting to RCU will require special care. Also, the presence of synchronize_rcu() means that the RCU version of delete() can now block. If this is a problem, there is a callback-based -mechanism that never blocks, namely call_rcu(), that can be used in -place of synchronize_rcu(). +mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can +be used in place of synchronize_rcu(). 7. FULL LIST OF RCU APIs @@ -813,6 +821,7 @@ RCU: Critical sections Grace period Barrier rcu_read_unlock synchronize_rcu rcu_dereference synchronize_rcu_expedited call_rcu + kfree_rcu bh: Critical sections Grace period Barrier From a4d611fdca0d696f9b8ffb007a119944ed5275fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Oct 2012 16:34:51 -0700 Subject: [PATCH 19/49] rcu: Document alternative RCU/reference-count algorithms The approach for mixing RCU and reference counting listed in the RCU documentation only describes one possible approach. This approach can result in failure on the read side, which is nice if you want fresh data, but not so good if you want simple code. This commit therefore adds two additional approaches that feature unconditional reference-count acquisition by RCU readers. These approaches are very similar to that used in the security code. Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcuref.txt | 61 ++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt index 4202ad0931300f..141d531aa14b03 100644 --- a/Documentation/RCU/rcuref.txt +++ b/Documentation/RCU/rcuref.txt @@ -20,7 +20,7 @@ release_referenced() delete() { { ... write_lock(&list_lock); atomic_dec(&el->rc, relfunc) ... - ... delete_element + ... remove_element } write_unlock(&list_lock); ... if (atomic_dec_and_test(&el->rc)) @@ -52,7 +52,7 @@ release_referenced() delete() { { ... spin_lock(&list_lock); if (atomic_dec_and_test(&el->rc)) ... - call_rcu(&el->head, el_free); delete_element + call_rcu(&el->head, el_free); remove_element ... spin_unlock(&list_lock); } ... if (atomic_dec_and_test(&el->rc)) @@ -64,3 +64,60 @@ Sometimes, a reference to the element needs to be obtained in the update (write) stream. In such cases, atomic_inc_not_zero() might be overkill, since we hold the update-side spinlock. One might instead use atomic_inc() in such cases. + +It is not always convenient to deal with "FAIL" in the +search_and_reference() code path. In such cases, the +atomic_dec_and_test() may be moved from delete() to el_free() +as follows: + +1. 2. +add() search_and_reference() +{ { + alloc_object rcu_read_lock(); + ... search_for_element + atomic_set(&el->rc, 1); atomic_inc(&el->rc); + spin_lock(&list_lock); ... + + add_element rcu_read_unlock(); + ... } + spin_unlock(&list_lock); 4. +} delete() +3. { +release_referenced() spin_lock(&list_lock); +{ ... + ... remove_element + if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock); + kfree(el); ... + ... call_rcu(&el->head, el_free); +} ... +5. } +void el_free(struct rcu_head *rhp) +{ + release_referenced(); +} + +The key point is that the initial reference added by add() is not removed +until after a grace period has elapsed following removal. This means that +search_and_reference() cannot find this element, which means that the value +of el->rc cannot increase. Thus, once it reaches zero, there are no +readers that can or ever will be able to reference the element. The +element can therefore safely be freed. This in turn guarantees that if +any reader finds the element, that reader may safely acquire a reference +without checking the value of the reference counter. + +In cases where delete() can sleep, synchronize_rcu() can be called from +delete(), so that el_free() can be subsumed into delete as follows: + +4. +delete() +{ + spin_lock(&list_lock); + ... + remove_element + spin_unlock(&list_lock); + ... + synchronize_rcu(); + if (atomic_dec_and_test(&el->rc)) + kfree(el); + ... +} From 7b2e6011f150c42235c4a541d20cf6891afe878a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Oct 2012 10:54:03 -0700 Subject: [PATCH 20/49] rcu: Rename ->onofflock to ->orphan_lock The ->onofflock field in the rcu_state structure at one time synchronized CPU-hotplug operations for RCU. However, its scope has decreased over time so that it now only protects the lists of orphaned RCU callbacks. This commit therefore renames it to ->orphan_lock to reflect its current use. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 12 ++++++------ kernel/rcutree.h | 7 +++---- kernel/rcutree_plugin.h | 3 ++- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac8aed8ee4175a..89148860e2baca 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -70,7 +70,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ @@ -1573,7 +1573,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Send the specified CPU's RCU callbacks to the orphanage. The * specified CPU must be offline, and the caller must hold the - * ->onofflock. + * ->orphan_lock. */ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, @@ -1623,7 +1623,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, /* * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->onofflock. + * orphanage. The caller must hold the ->orphan_lock. */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) { @@ -1702,7 +1702,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -1729,10 +1729,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* * We still hold the leaf rcu_node structure lock here, and * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock * held leads to deadlock. */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a240f032848eec..a7c945d149cfa8 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -383,9 +383,8 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; - /* exclude on/offline and */ - /* starting new GP. */ + raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; + /* Protect following fields. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ struct rcu_head **orphan_nxttail; /* Tail of above. */ @@ -394,7 +393,7 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - /* End of fields guarded by onofflock. */ + /* End of fields guarded by orphan_lock. */ struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f921154881870b..2b281cf0b6f2da 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -757,7 +757,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * grace period for the specified rcu_node structure. If there are no such * tasks, report it up the rcu_node hierarchy. * - * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + * Caller must hold sync_rcu_preempt_exp_mutex and must exclude + * CPU hotplug operations. */ static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) From 1924bcb0259711eea98491a7942d1ffbf677e114 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 12:30:37 -0700 Subject: [PATCH 21/49] rcu: Avoid counter wrap in synchronize_sched_expedited() There is a counter scheme similar to ticket locking that synchronize_sched_expedited() uses to service multiple concurrent callers with the same expedited grace period. Upon entry, a sync_sched_expedited_started variable is atomically incremented, and upon completion of a expedited grace period a separate sync_sched_expedited_done variable is atomically incremented. However, if a synchronize_sched_expedited() is delayed while in try_stop_cpus(), concurrent invocations will increment the sync_sched_expedited_started counter, which will eventually overflow. If the original synchronize_sched_expedited() resumes execution just as the counter overflows, a concurrent invocation could incorrectly conclude that an expedited grace period elapsed in zero time, which would be bad. One could rely on counter size to prevent this from happening in practice, but the goal is to formally validate this code, so it needs to be fixed anyway. This commit therefore checks the gap between the two counters before incrementing sync_sched_expedited_started, and if the gap is too large, does a normal grace period instead. Overflow is thus only possible if there are more than about 3.5 billion threads on 32-bit systems, which can be excluded until such time as task_struct fits into a single byte and 4G/4G patches are accepted into mainline. It is also easy to encode this limitation into mechanical theorem provers. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89148860e2baca..678905555ca451 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2249,8 +2249,8 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); +static atomic_long_t sync_sched_expedited_started = ATOMIC_LONG_INIT(0); +static atomic_long_t sync_sched_expedited_done = ATOMIC_LONG_INIT(0); static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -2308,10 +2308,30 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - int firstsnap, s, snap, trycount = 0; + long firstsnap, s, snap; + int trycount = 0; - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + /* + * If we are in danger of counter wrap, just do synchronize_sched(). + * By allowing sync_sched_expedited_started to advance no more than + * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring + * that more than 3.5 billion CPUs would be required to force a + * counter wrap on a 32-bit system. Quite a few more CPUs would of + * course be required on a 64-bit system. + */ + if (ULONG_CMP_GE((ulong)atomic_read(&sync_sched_expedited_started), + (ulong)atomic_read(&sync_sched_expedited_done) + + ULONG_MAX / 8)) { + synchronize_sched(); + return; + } + + /* + * Take a ticket. Note that atomic_inc_return() implies a + * full memory barrier. + */ + snap = atomic_long_inc_return(&sync_sched_expedited_started); + firstsnap = snap; get_online_cpus(); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); @@ -2324,6 +2344,13 @@ void synchronize_sched_expedited(void) NULL) == -EAGAIN) { put_online_cpus(); + /* Check to see if someone else did our work for us. */ + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + /* No joy, try again later. Or just synchronize_sched(). */ if (trycount++ < 10) { udelay(trycount * num_online_cpus()); @@ -2332,23 +2359,22 @@ void synchronize_sched_expedited(void) return; } - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + /* Recheck to see if someone else did our work for us. */ + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; } /* * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. + * callers to piggyback on our grace period. We retry + * after they started, so our grace period works for them, + * and they started after our first try, so their grace + * period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); + snap = atomic_long_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -2356,15 +2382,15 @@ void synchronize_sched_expedited(void) * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. + * than we did already did their update. */ do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { smp_mb(); /* ensure test happens before caller kfree */ break; } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + } while (atomic_long_cmpxchg(&sync_sched_expedited_done, s, snap) != s); put_online_cpus(); } From 40694d6644d5cca28531707559466122eb212d8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 15:24:03 -0700 Subject: [PATCH 22/49] rcu: Move synchronize_sched_expedited() state to rcu_state Tracing (debugfs) of expedited RCU primitives is required, which in turn requires that the relevant data be located where the tracing code can find it, not in its current static global variables in kernel/rcutree.c. This commit therefore moves sync_sched_expedited_started and sync_sched_expedited_done to the rcu_state structure, as fields ->expedited_start and ->expedited_done, respectively. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 20 +++++++++----------- kernel/rcutree.h | 3 +++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 678905555ca451..3c72e5e5528c7e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2249,9 +2249,6 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static atomic_long_t sync_sched_expedited_started = ATOMIC_LONG_INIT(0); -static atomic_long_t sync_sched_expedited_done = ATOMIC_LONG_INIT(0); - static int synchronize_sched_expedited_cpu_stop(void *data) { /* @@ -2310,6 +2307,7 @@ void synchronize_sched_expedited(void) { long firstsnap, s, snap; int trycount = 0; + struct rcu_state *rsp = &rcu_sched_state; /* * If we are in danger of counter wrap, just do synchronize_sched(). @@ -2319,8 +2317,8 @@ void synchronize_sched_expedited(void) * counter wrap on a 32-bit system. Quite a few more CPUs would of * course be required on a 64-bit system. */ - if (ULONG_CMP_GE((ulong)atomic_read(&sync_sched_expedited_started), - (ulong)atomic_read(&sync_sched_expedited_done) + + if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), + (ulong)atomic_long_read(&rsp->expedited_done) + ULONG_MAX / 8)) { synchronize_sched(); return; @@ -2330,7 +2328,7 @@ void synchronize_sched_expedited(void) * Take a ticket. Note that atomic_inc_return() implies a * full memory barrier. */ - snap = atomic_long_inc_return(&sync_sched_expedited_started); + snap = atomic_long_inc_return(&rsp->expedited_start); firstsnap = snap; get_online_cpus(); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); @@ -2345,7 +2343,7 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; @@ -2360,7 +2358,7 @@ void synchronize_sched_expedited(void) } /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; @@ -2374,7 +2372,7 @@ void synchronize_sched_expedited(void) * period works for us. */ get_online_cpus(); - snap = atomic_long_read(&sync_sched_expedited_started); + snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -2385,12 +2383,12 @@ void synchronize_sched_expedited(void) * than we did already did their update. */ do { - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { smp_mb(); /* ensure test happens before caller kfree */ break; } - } while (atomic_long_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); put_online_cpus(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a7c945d149cfa8..88f3d9d5971d74 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -404,6 +404,9 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + atomic_long_t expedited_start; /* Starting ticket. */ + atomic_long_t expedited_done; /* Done ticket. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ From a30489c5228fba6f16b4c740a0292879ef13371e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 16:18:09 -0700 Subject: [PATCH 23/49] rcu: Instrument synchronize_rcu_expedited() for debugfs tracing This commit adds the counters to rcu_state and updates them in synchronize_rcu_expedited() to provide the data needed for debugfs tracing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 18 +++++++++++++++--- kernel/rcutree.h | 9 +++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3c72e5e5528c7e..b966d56ebb5157 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2321,6 +2321,7 @@ void synchronize_sched_expedited(void) (ulong)atomic_long_read(&rsp->expedited_done) + ULONG_MAX / 8)) { synchronize_sched(); + atomic_long_inc(&rsp->expedited_wrap); return; } @@ -2341,11 +2342,14 @@ void synchronize_sched_expedited(void) synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); + atomic_long_inc(&rsp->expedited_tryfail); /* Check to see if someone else did our work for us. */ s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone1); return; } @@ -2354,13 +2358,16 @@ void synchronize_sched_expedited(void) udelay(trycount * num_online_cpus()); } else { synchronize_sched(); + atomic_long_inc(&rsp->expedited_normal); return; } /* Recheck to see if someone else did our work for us. */ s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone2); return; } @@ -2375,6 +2382,7 @@ void synchronize_sched_expedited(void) snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } + atomic_long_inc(&rsp->expedited_stoppedcpus); /* * Everyone up to our most recent fetch is covered by our grace @@ -2383,12 +2391,16 @@ void synchronize_sched_expedited(void) * than we did already did their update. */ do { + atomic_long_inc(&rsp->expedited_done_tries); s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_done_lost); break; } } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); + atomic_long_inc(&rsp->expedited_done_exit); put_online_cpus(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 88f3d9d5971d74..d274af3572103d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -406,6 +406,15 @@ struct rcu_state { atomic_long_t expedited_start; /* Starting ticket. */ atomic_long_t expedited_done; /* Done ticket. */ + atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ + atomic_long_t expedited_done_tries; /* # tries to update _done. */ + atomic_long_t expedited_done_lost; /* # times beaten to _done. */ + atomic_long_t expedited_done_exit; /* # times exited _done loop. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ From 573bcd40d221bd6d7cebf27dee120bd242f5feb5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:02 +0800 Subject: [PATCH 24/49] rcu: Create directory for each flavor of rcu This patch will create subdirectory according to each flavor of rcu, the new structure will be: /debugfs/rcu/ -> rsp_0 -> rsp_1 -> ... So we can go to '/debugfs/rcu/rsp_0' and get the cpu info of rsp_0 there. The flavors of RCU are currently rcu_bh, rcu_preempt, and rcu_sched. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 693513bc50e6d0..62223a27f985e0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -446,12 +446,20 @@ static struct dentry *rcudir; static int __init rcutree_trace_init(void) { + struct rcu_state *rsp; struct dentry *retval; + struct dentry *rspdir; rcudir = debugfs_create_dir("rcu", NULL); if (!rcudir) goto free_out; + for_each_rcu_flavor(rsp) { + rspdir = debugfs_create_dir(rsp->name, rcudir); + if (!rspdir) + goto free_out; + } + retval = debugfs_create_file("rcubarrier", 0444, rcudir, NULL, &rcubarrier_fops); if (!retval) From 374b928ee8061fdbb0b527fb3924080ba2437767 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:03 +0800 Subject: [PATCH 25/49] rcu: Fundamental facility for 'CPU units sequence reading' This patch add the fundamental facility used by the following patches, so we can implement the 'CPU units sequence reading' later. This helps us avoid losing data when there are too many CPUs and too small of a buffer, since this new approach allows userspace to read out the data one CPU at a time. Thus, if the buffer is not large enough, userspace will get whatever CPUs fit, and can then issue another read for the remainder of the data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 62223a27f985e0..0dfe9b512f0f92 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,36 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static int r_open(struct inode *inode, struct file *file, + const struct seq_operations *op) +{ + int ret = seq_open(file, op); + if (!ret) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + *pos = cpumask_next(*pos - 1, cpu_possible_mask); + if ((*pos) < nr_cpu_ids) + return per_cpu_ptr(rsp->rda, *pos); + return NULL; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return r_start(m, pos); +} + +static void r_stop(struct seq_file *m, void *v) +{ +} + static int show_rcubarrier(struct seq_file *m, void *unused) { struct rcu_state *rsp; From 878eda72e24d11e463a25b1dc7097a8d953f17cb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:04 +0800 Subject: [PATCH 26/49] rcu: Optimize the 'rcudata' for RCU trace This patch implements the new 'rcudata' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0dfe9b512f0f92..a11522f62c714b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -174,6 +174,32 @@ static const struct file_operations rcudata_fops = { .release = single_release, }; +static int new_show_rcudata(struct seq_file *m, void *v) +{ + print_one_rcu_data(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations new_rcudate_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcudata, +}; + +static int new_rcudata_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcudate_op); +} + +static const struct file_operations new_rcudata_fops = { + .owner = THIS_MODULE, + .open = new_rcudata_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -488,6 +514,11 @@ static int __init rcutree_trace_init(void) rspdir = debugfs_create_dir(rsp->name, rcudir); if (!rspdir) goto free_out; + + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &new_rcudata_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From d29200efa2ad7a1dc516a1048faf98dcc01b9fef Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:05 +0800 Subject: [PATCH 27/49] rcu: Optimize the 'rcudata.csv' for RCU trace This patch implements the new 'rcudata.csv' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a11522f62c714b..e387a642632b38 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -267,6 +267,43 @@ static const struct file_operations rcudata_csv_fops = { .release = single_release, }; +static int new_show_rcudata_csv(struct seq_file *m, void *v) +{ + struct rcu_data *rdp = (struct rcu_data *)v; + if (cpumask_first(cpu_possible_mask) == rdp->cpu) { + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); + seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); +#ifdef CONFIG_RCU_BOOST + seq_puts(m, "\"kt\",\"ktl\""); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); + } + + print_one_rcu_data_csv(m, rdp); + return 0; +} + +static const struct seq_operations new_rcudate_csv_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcudata_csv, +}; + +static int new_rcudata_csv_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcudate_csv_op); +} + +static const struct file_operations new_rcudata_csv_fops = { + .owner = THIS_MODULE, + .open = new_rcudata_csv_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -519,6 +556,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcudata_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcudata.csv", 0444, + rspdir, rsp, &new_rcudata_csv_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From 51d0f16d49f6a99189e80c50e18a12325664ef41 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:06 +0800 Subject: [PATCH 28/49] rcu: Optimize the 'rcu_pending' for RCU trace This patch implements the new 'rcu_pending' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index e387a642632b38..8b248672221130 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -467,6 +467,8 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { + if (!rdp->beenonline) + return; seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', @@ -485,16 +487,12 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; - struct rcu_data *rdp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); - } + for_each_possible_cpu(cpu) + print_one_rcu_pending(m, per_cpu_ptr(rsp->rda, cpu)); } return 0; } @@ -512,6 +510,32 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int new_show_rcu_pending(struct seq_file *m, void *v) +{ + print_one_rcu_pending(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations new_rcu_pending_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcu_pending, +}; + +static int new_rcu_pending_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcu_pending_op); +} + +static const struct file_operations new_rcu_pending_fops = { + .owner = THIS_MODULE, + .open = new_rcu_pending_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static int show_rcutorture(struct seq_file *m, void *unused) { seq_printf(m, "rcutorture test sequence: %lu %s\n", @@ -561,6 +585,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcudata_csv_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &new_rcu_pending_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From c011c41f11c4bf9b0c8d489a458770c24aeb2ebd Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:07 +0800 Subject: [PATCH 29/49] rcu: Replace the old interface with the new one This patch removed the old RCU debugfs interface and replaced it with the new one. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 148 +++++++---------------------------------- 1 file changed, 24 insertions(+), 124 deletions(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 8b248672221130..0e2ab6427b6462 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -148,53 +148,27 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -static int show_rcudata(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata, NULL); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcudata(struct seq_file *m, void *v) +static int show_rcudata(struct seq_file *m, void *v) { print_one_rcu_data(m, (struct rcu_data *)v); return 0; } -static const struct seq_operations new_rcudate_op = { +static const struct seq_operations rcudate_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcudata, + .show = show_rcudata, }; -static int new_rcudata_open(struct inode *inode, struct file *file) +static int rcudata_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcudate_op); + return r_open(inode, file, &rcudate_op); } -static const struct file_operations new_rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, - .open = new_rcudata_open, + .open = rcudata_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -234,40 +208,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -static int show_rcudata_csv(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); - for_each_rcu_flavor(rsp) { - seq_printf(m, "\"%s:\"\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcudata_csv_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata_csv, NULL); -} - -static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcudata_csv(struct seq_file *m, void *v) +static int show_rcudata_csv(struct seq_file *m, void *v) { struct rcu_data *rdp = (struct rcu_data *)v; if (cpumask_first(cpu_possible_mask) == rdp->cpu) { @@ -284,21 +225,21 @@ static int new_show_rcudata_csv(struct seq_file *m, void *v) return 0; } -static const struct seq_operations new_rcudate_csv_op = { +static const struct seq_operations rcudate_csv_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcudata_csv, + .show = show_rcudata_csv, }; -static int new_rcudata_csv_open(struct inode *inode, struct file *file) +static int rcudata_csv_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcudate_csv_op); + return r_open(inode, file, &rcudate_csv_op); } -static const struct file_operations new_rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, - .open = new_rcudata_csv_open, + .open = rcudata_csv_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -484,53 +425,27 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_need_nothing); } -static int show_rcu_pending(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_pending(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_pending, NULL); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcu_pending(struct seq_file *m, void *v) +static int show_rcu_pending(struct seq_file *m, void *v) { print_one_rcu_pending(m, (struct rcu_data *)v); return 0; } -static const struct seq_operations new_rcu_pending_op = { +static const struct seq_operations rcu_pending_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcu_pending, + .show = show_rcu_pending, }; -static int new_rcu_pending_open(struct inode *inode, struct file *file) +static int rcu_pending_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcu_pending_op); + return r_open(inode, file, &rcu_pending_op); } -static const struct file_operations new_rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, - .open = new_rcu_pending_open, + .open = rcu_pending_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -577,17 +492,17 @@ static int __init rcutree_trace_init(void) goto free_out; retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &new_rcudata_fops); + rspdir, rsp, &rcudata_fops); if (!retval) goto free_out; retval = debugfs_create_file("rcudata.csv", 0444, - rspdir, rsp, &new_rcudata_csv_fops); + rspdir, rsp, &rcudata_csv_fops); if (!retval) goto free_out; retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &new_rcu_pending_fops); + rspdir, rsp, &rcu_pending_fops); if (!retval) goto free_out; } @@ -597,16 +512,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcudata.csv", 0444, rcudir, - NULL, &rcudata_csv_fops); - if (!retval) - goto free_out; - if (rcu_boost_trace_create_file(rcudir)) goto free_out; @@ -619,11 +524,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, rcudir, - NULL, &rcu_pending_fops); - if (!retval) - goto free_out; - retval = debugfs_create_file("rcutorture", 0444, rcudir, NULL, &rcutorture_fops); if (!retval) From 5f4ee1fa16fa1bee673b75722ca43350a74fba36 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:08 +0800 Subject: [PATCH 30/49] rcu: Remove the interface "rcudata.csv" This patch removes the interface "rcudata.csv" since it is apparently not used. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 76 ------------------------------------------ 1 file changed, 76 deletions(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0e2ab6427b6462..bcc4865fea8ba4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -174,77 +174,6 @@ static const struct file_operations rcudata_fops = { .release = seq_release, }; -static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->qs_pending); - seq_printf(m, ",%d,%llx,%d,%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, ",%lu", rdp->offline_fqs); - seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, ",%d,\"%c\"", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu))); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, ",%ld", rdp->blimit); - seq_printf(m, ",%lu,%lu,%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata_csv(struct seq_file *m, void *v) -{ - struct rcu_data *rdp = (struct rcu_data *)v; - if (cpumask_first(cpu_possible_mask) == rdp->cpu) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); - } - - print_one_rcu_data_csv(m, rdp); - return 0; -} - -static const struct seq_operations rcudate_csv_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata_csv, -}; - -static int rcudata_csv_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_csv_op); -} - -static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -496,11 +425,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcudata.csv", 0444, - rspdir, rsp, &rcudata_csv_fops); - if (!retval) - goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, rspdir, rsp, &rcu_pending_fops); if (!retval) From 42c3533eee88e012e1aa3c4d6d2cc53354130e24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Sep 2012 10:49:58 -0700 Subject: [PATCH 31/49] rcu: Fix tracing formatting The rcu_state structure's ->completed field is unsigned long, so this commit adjusts show_one_rcugp()'s printf() format to suit. Also add the required ACCESS_ONCE() directives while we are in this function. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 4 ++-- kernel/rcutree_trace.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b966d56ebb5157..8ed9c481db035b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -68,8 +68,8 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ - .gpnum = -300, \ - .completed = -300, \ + .gpnum = 0UL - 300UL, \ + .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index bcc4865fea8ba4..3312ed7e411e74 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +#define ulong2long(a) (*(long *)(&(a))) + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -116,10 +118,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->completed, rdp->gpnum, + ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), @@ -246,8 +248,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", - rsp->name, rsp->completed, gpnum, rsp->fqs_state, + seq_printf(m, "%s: c=%ld g=%ld s=%d jfq=%ld j=%x ", + rsp->name, ulong2long(rsp->completed), ulong2long(gpnum), + rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", @@ -301,16 +304,16 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp = &rsp->node[0]; raw_spin_lock_irqsave(&rnp->lock, flags); - completed = rsp->completed; - gpnum = rsp->gpnum; - if (rsp->completed == rsp->gpnum) + completed = ACCESS_ONCE(rsp->completed); + gpnum = ACCESS_ONCE(rsp->gpnum); + if (completed == gpnum) gpage = 0; else gpage = jiffies - rsp->gp_start; gpmax = rsp->gp_max; raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", - rsp->name, completed, gpnum, gpage, gpmax); + seq_printf(m, "%s: completed=%ld gpnum=%ld age=%ld max=%ld\n", + rsp->name, ulong2long(completed), ulong2long(gpnum), gpage, gpmax); } static int show_rcugp(struct seq_file *m, void *unused) From c25e557f5d49a7cb94fad473f5ced75b6c7ce094 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:16 +0800 Subject: [PATCH 32/49] rcu: split 'rcubarrier' to each flavor This patch add new 'rcubarrier' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcubarrier' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3312ed7e411e74..65b6265531ff2d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -103,6 +103,28 @@ static const struct file_operations rcubarrier_fops = { .release = single_release, }; +static int new_show_rcubarrier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + seq_printf(m, "bcc: %d nbd: %lu\n", + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); + return 0; +} + +static int new_rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcubarrier, inode->i_private); +} + +static const struct file_operations new_rcubarrier_fops = { + .owner = THIS_MODULE, + .open = new_rcubarrier_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -432,6 +454,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &new_rcubarrier_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From 29c67764f121a0980eb30d0314821ea631e6cfaf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:17 +0800 Subject: [PATCH 33/49] rcu: split 'rcuboost' to each flavor This patch add new 'rcuboost' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcuboost' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 65b6265531ff2d..cae417de4b9417 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -241,7 +241,7 @@ static const struct file_operations rcu_node_boost_fops = { .owner = THIS_MODULE, .open = rcu_node_boost_open, .read = seq_read, - .llseek = seq_lseek, + .llseek = no_llseek, .release = single_release, }; @@ -459,6 +459,15 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcubarrier_fops); if (!retval) goto free_out; + +#ifdef CONFIG_RCU_BOOST + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); + if (!retval) + goto free_out; + } +#endif } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From 66b38bc52bd1e0d00987e23bf7153c46201ff2ba Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:18 +0800 Subject: [PATCH 34/49] rcu: split 'rcugp' to each flavor This patch add new 'rcugp' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcugp' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index cae417de4b9417..c90d2a917def81 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -360,6 +360,26 @@ static const struct file_operations rcugp_fops = { .release = single_release, }; +static int new_show_rcugp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); + return 0; +} + +static int new_rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcugp, inode->i_private); +} + +static const struct file_operations new_rcugp_fops = { + .owner = THIS_MODULE, + .open = new_rcugp_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -468,6 +488,12 @@ static int __init rcutree_trace_init(void) goto free_out; } #endif + + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &new_rcugp_fops); + if (!retval) + goto free_out; + } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From a608d84bdb832a86ad3fdb0767df31fcda9fe280 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:19 +0800 Subject: [PATCH 35/49] rcu: split 'rcuhier' to each flavor This patch add new 'rcuhier' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcuhier' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c90d2a917def81..107997ecdeebcb 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -316,6 +316,26 @@ static const struct file_operations rcuhier_fops = { .release = single_release, }; +static int new_show_rcuhier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); + return 0; +} + +static int new_rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcuhier, inode->i_private); +} + +static const struct file_operations new_rcuhier_fops = { + .owner = THIS_MODULE, + .open = new_rcuhier_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) { unsigned long flags; @@ -494,6 +514,10 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &new_rcuhier_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, From 6ee0886ff6c81526bf6ad8521d843b934aafa5aa Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 11 Oct 2012 14:26:42 -0700 Subject: [PATCH 36/49] rcu: Remove old debugfs interfaces and also RCU flavor name This commit removes the old debugfs interfaces, so that the new directory-per-RCU-flavor versions remain. Because the RCU flavor is given by the directory name, there is no need to print it out, so remove the name from the printout. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 190 ++++++++++------------------------------- 1 file changed, 44 insertions(+), 146 deletions(-) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 107997ecdeebcb..967115c508bc40 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -78,32 +78,7 @@ static void r_stop(struct seq_file *m, void *v) { } -static int show_rcubarrier(struct seq_file *m, void *unused) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - seq_printf(m, "%s: bcc: %d nbd: %lu\n", - rsp->name, - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, NULL); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcubarrier(struct seq_file *m, void *v) +static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; seq_printf(m, "bcc: %d nbd: %lu\n", @@ -112,14 +87,14 @@ static int new_show_rcubarrier(struct seq_file *m, void *v) return 0; } -static int new_rcubarrier_open(struct inode *inode, struct file *file) +static int rcubarrier_open(struct inode *inode, struct file *file) { - return single_open(file, new_show_rcubarrier, inode->i_private); + return single_open(file, show_rcubarrier, inode->i_private); } -static const struct file_operations new_rcubarrier_fops = { +static const struct file_operations rcubarrier_fops = { .owner = THIS_MODULE, - .open = new_rcubarrier_open, + .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -245,23 +220,7 @@ static const struct file_operations rcu_node_boost_fops = { .release = single_release, }; -/* - * Create the rcuboost debugfs entry. Standard error return. - */ -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, - &rcu_node_boost_fops); -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return 0; /* There cannot be an error if we didn't create it! */ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_BOOST */ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { @@ -270,8 +229,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%ld g=%ld s=%d jfq=%ld j=%x ", - rsp->name, ulong2long(rsp->completed), ulong2long(gpnum), + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", + ulong2long(rsp->completed), ulong2long(gpnum), rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); @@ -294,44 +253,22 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); } -static int show_rcuhier(struct seq_file *m, void *unused) +static int show_rcuhier(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - print_one_rcu_state(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); return 0; } static int rcuhier_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcuhier, NULL); + return single_open(file, show_rcuhier, inode->i_private); } static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int new_rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, new_show_rcuhier, inode->i_private); -} - -static const struct file_operations new_rcuhier_fops = { - .owner = THIS_MODULE, - .open = new_rcuhier_open, - .read = seq_read, .llseek = no_llseek, .release = seq_release, }; @@ -354,48 +291,26 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) gpage = jiffies - rsp->gp_start; gpmax = rsp->gp_max; raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%ld age=%ld max=%ld\n", - rsp->name, ulong2long(completed), ulong2long(gpnum), gpage, gpmax); + seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", + ulong2long(completed), ulong2long(gpnum), gpage, gpmax); } -static int show_rcugp(struct seq_file *m, void *unused) +static int show_rcugp(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - show_one_rcugp(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); return 0; } static int rcugp_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcugp, NULL); + return single_open(file, show_rcugp, inode->i_private); } static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int new_rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, new_show_rcugp, inode->i_private); -} - -static const struct file_operations new_rcugp_fops = { - .owner = THIS_MODULE, - .open = new_rcugp_open, - .read = seq_read, .llseek = no_llseek, .release = seq_release, }; @@ -485,57 +400,40 @@ static int __init rcutree_trace_init(void) if (!rspdir) goto free_out; - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &rcudata_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &rcu_pending_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &new_rcubarrier_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &rcubarrier_fops); + if (!retval) + goto free_out; #ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &new_rcugp_fops); + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); if (!retval) goto free_out; + } +#endif - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &new_rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcubarrier", 0444, rcudir, - NULL, &rcubarrier_fops); - if (!retval) - goto free_out; - - if (rcu_boost_trace_create_file(rcudir)) - goto free_out; - - retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &rcugp_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcuhier", 0444, rcudir, - NULL, &rcuhier_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &rcuhier_fops); + if (!retval) + goto free_out; + } retval = debugfs_create_file("rcutorture", 0444, rcudir, NULL, &rcutorture_fops); From 7bd8f2a74bcbd39f4277766f4d49441c45dd10a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 17:14:32 -0700 Subject: [PATCH 37/49] rcu: Add tracing for synchronize_sched_expedited() This commit adds a per-RCU-flavor "rcuexp" file that dumps out statistics for synchonize_sched_expedited(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 967115c508bc40..f9512687a6e529 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -173,6 +173,38 @@ static const struct file_operations rcudata_fops = { .release = seq_release, }; +static int show_rcuexp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + + seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", + atomic_long_read(&rsp->expedited_start), + atomic_long_read(&rsp->expedited_done), + atomic_long_read(&rsp->expedited_wrap), + atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone1), + atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_normal), + atomic_long_read(&rsp->expedited_stoppedcpus), + atomic_long_read(&rsp->expedited_done_tries), + atomic_long_read(&rsp->expedited_done_lost), + atomic_long_read(&rsp->expedited_done_exit)); + return 0; +} + +static int rcuexp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuexp, inode->i_private); +} + +static const struct file_operations rcuexp_fops = { + .owner = THIS_MODULE, + .open = rcuexp_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -405,6 +437,11 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + retval = debugfs_create_file("rcuexp", 0444, + rspdir, rsp, &rcuexp_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcu_pending", 0444, rspdir, rsp, &rcu_pending_fops); if (!retval) From 878d7439d0f45a95869e417576774673d1fa243f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Oct 2012 04:55:36 -0700 Subject: [PATCH 38/49] rcu: Fix batch-limit size problem Commit 29c00b4a1d9e27 (rcu: Add event-tracing for RCU callback invocation) added a regression in rcu_do_batch() Under stress, RCU is supposed to allow to process all items in queue, instead of a batch of 10 items (blimit), but an integer overflow makes the effective limit being 1. So, unless there is frequent idle periods (during which RCU ignores batch limits), RCU can be forced into a state where it cannot keep up with the callback-generation rate, eventually resulting in OOM. This commit therefore converts a few variables in rcu_do_batch() from int to long to fix this problem, along with the module parameters controlling the batch limits. Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Cc: # 3.2 + --- kernel/rcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 93d6871bf7f960..e4c2192b47c8ce 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -212,13 +212,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif }; -static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static int qhimark = 10000; /* If this many pending, ignore blimit. */ -static int qlowmark = 100; /* Once only this many pending, use blimit. */ +static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ +static long qhimark = 10000; /* If this many pending, ignore blimit. */ +static long qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0444); -module_param(qhimark, int, 0444); -module_param(qlowmark, int, 0444); +module_param(blimit, long, 0444); +module_param(qhimark, long, 0444); +module_param(qlowmark, long, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; @@ -1791,7 +1791,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy, i; + long bl, count, count_lazy; + int i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { From bb08f76d8419a163b7a4212a0e4ea6bd25acacd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Oct 2012 12:33:37 -0700 Subject: [PATCH 39/49] rcu: Remove list_for_each_continue_rcu() The list_for_each_continue_rcu() macro is no longer used, so this commit removes it. The list_for_each_entry_continue_rcu() macro should be used instead. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 17 ++++++++--------- Documentation/RCU/whatisRCU.txt | 4 +--- include/linux/rculist.h | 17 ----------------- 3 files changed, 9 insertions(+), 29 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index cdb20d41a44ad8..31ef8fe07f8280 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -271,15 +271,14 @@ over a rather long period of time, but improvements are always welcome! The same cautions apply to call_rcu_bh() and call_rcu_sched(). 9. All RCU list-traversal primitives, which include - rcu_dereference(), list_for_each_entry_rcu(), - list_for_each_continue_rcu(), and list_for_each_safe_rcu(), - must be either within an RCU read-side critical section or - must be protected by appropriate update-side locks. RCU - read-side critical sections are delimited by rcu_read_lock() - and rcu_read_unlock(), or by similar primitives such as - rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case - the matching rcu_dereference() primitive must be used in order - to keep lockdep happy, in this case, rcu_dereference_bh(). + rcu_dereference(), list_for_each_entry_rcu(), and + list_for_each_safe_rcu(), must be either within an RCU read-side + critical section or must be protected by appropriate update-side + locks. RCU read-side critical sections are delimited by + rcu_read_lock() and rcu_read_unlock(), or by similar primitives + such as rcu_read_lock_bh() and rcu_read_unlock_bh(), in which + case the matching rcu_dereference() primitive must be used in + order to keep lockdep happy, in this case, rcu_dereference_bh(). The reason that it is permissible to use RCU list-traversal primitives when the update-side lock is held is that doing so diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index bf0f6de2aa00c8..9d30de00d730c6 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -789,9 +789,7 @@ RCU list traversal: list_for_each_entry_rcu hlist_for_each_entry_rcu hlist_nulls_for_each_entry_rcu - - list_for_each_continue_rcu (to be deprecated in favor of new - list_for_each_entry_continue_rcu) + list_for_each_entry_continue_rcu RCU pointer/list update: diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e0f0fab204154f..c92dd28eaa6c50 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -286,23 +286,6 @@ static inline void list_splice_init_rcu(struct list_head *list, &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) - -/** - * list_for_each_continue_rcu - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - * - * Iterate over an rcu-protected list, continuing after current point. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ - (pos) != (head); \ - (pos) = rcu_dereference_raw(list_next_rcu(pos))) - /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. From 67afeed2cab0e59712b4ebf1aef9a2e555a188ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Oct 2012 12:56:06 -0700 Subject: [PATCH 40/49] rcu: Add new rcutorture module parameters to start/end test messages Several new rcutorture module parameters have been added, but are not printed to the console at the beginning and end of tests, which makes it difficult to reproduce a prior test. This commit therefore adds these new module parameters to the list printed at the beginning and the end of the tests. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a86..7fa184f6a48fe0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1396,12 +1396,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, onoff_interval, onoff_holdoff); } From f0a0e6f282c72247e7c8ec17c68d528c1bb4d49e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Oct 2012 13:47:01 -0700 Subject: [PATCH 41/49] rcu: Clarify memory-ordering properties of grace-period primitives This commit explicitly states the memory-ordering properties of the RCU grace-period primitives. Although these properties were in some sense implied by the fundmental property of RCU ("a grace period must wait for all pre-existing RCU read-side critical sections to complete"), stating it explicitly will be a great labor-saving device. Reported-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Oleg Nesterov --- include/linux/rcupdate.h | 25 +++++++++++++++++++++++++ kernel/rcutree.c | 29 +++++++++++++++++++++++++---- kernel/rcutree_plugin.h | 8 ++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7c968e4f929ea4..6256759fb81ef0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -90,6 +90,25 @@ extern void do_trace_rcu_torture_read(char *rcutorturename, * that started after call_rcu() was invoked. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ extern void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); @@ -118,6 +137,9 @@ extern void call_rcu(struct rcu_head *head, * OR * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ extern void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head)); @@ -137,6 +159,9 @@ extern void call_rcu_bh(struct rcu_head *head, * OR * anything that disables preemption. * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ extern void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e4c2192b47c8ce..15a2beec320fe1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2228,10 +2228,28 @@ static inline int rcu_blocking_is_gp(void) * rcu_read_lock_sched(). * * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. + * non-threaded hardware-interrupt handlers, in progress on entry will + * have completed before this primitive returns. However, this does not + * guarantee that softirq handlers will have completed, since in some + * kernels, these handlers can run in process context, and can block. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_sched() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-sched read-side critical section whose beginning + * preceded the call to synchronize_sched(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_sched() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_sched() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_sched(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). * * This primitive provides the guarantees made by the (now removed) * synchronize_kernel() API. In contrast, synchronize_rcu() only @@ -2259,6 +2277,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), * and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu_bh(void) { diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f921154881870b..57e0ef8ed721ea 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -670,6 +670,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu(void) { @@ -875,6 +878,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. */ void rcu_barrier(void) { From 351573a86d0ef17cbba1c5436706602692781bfe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Oct 2012 04:52:56 -0700 Subject: [PATCH 42/49] rcu: Fix TINY_RCU rcu_is_cpu_rrupt_from_idle check The rcu_is_cpu_rrupt_from_idle() needs to allow for one interrupt level from the idle loop, but TINY_RCU checks for a call from the idle loop itself. This commit fixes this issue. Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e4c6a598d6f749..e7dce58f9c2aa3 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); */ int rcu_is_cpu_rrupt_from_idle(void) { - return rcu_dynticks_nesting <= 0; + return rcu_dynticks_nesting <= 1; } /* From c896054f75f9a720ecf2ab3e688f4da79a55fe05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Oct 2012 17:59:23 -0700 Subject: [PATCH 43/49] rcu: Reduce default RCU CPU stall warning timeout The RCU CPU stall warning timeout has defaulted to 60 seconds for some years, with almost no false positives. This commit therefore reduces the default to 21 seconds, slightly shorter than the new soft-lockup timeout. Signed-off-by: Paul E. McKenney --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28e9d6c9894129..41faf0b8df1df9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -972,7 +972,7 @@ config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on TREE_RCU || TREE_PREEMPT_RCU range 3 300 - default 60 + default 21 help If a given RCU grace period extends more than the specified number of seconds, a CPU stall warning is printed. If the From 40e80c469f1b52a68e09da3808a1228cf9947fa7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Oct 2012 13:00:15 -0700 Subject: [PATCH 44/49] rcu: Update documentation for TREE_RCU debugfs tracing This commit updates the tracing documentation to reflect the new format that has per-RCU-flavor directories. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 365 ++++++++++++++++++------------------ 1 file changed, 182 insertions(+), 183 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 672d1908325249..79ce9891a8c734 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -10,51 +10,61 @@ for rcutree and next for rcutiny. CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats -These implementations of RCU provides several debugfs files under the +These implementations of RCU provide several debugfs directories under the top-level directory "rcu": -rcu/rcudata: +rcu/rcu_bh +rcu/rcu_preempt +rcu/rcu_sched + +Each directory contains files for the corresponding flavor of RCU. +Note that rcu/rcu_preempt is only present for CONFIG_TREE_PREEMPT_RCU. +For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor, +so that activity for both appears in rcu/rcu_sched. + +In addition, the following file appears in the top-level directory: +rcu/rcutorture. This file displays rcutorture test progress. The output +of "cat rcu/rcutorture" looks as follows: + +rcutorture test sequence: 0 (test in progress) +rcutorture update version number: 615 + +The first line shows the number of rcutorture tests that have completed +since boot. If a test is currently running, the "(test in progress)" +string will appear as shown above. The second line shows the number of +update cycles that the current test has started, or zero if there is +no test in progress. + + +Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly +also rcu/rcu_preempt) the following files will be present: + +rcudata: Displays fields in struct rcu_data. -rcu/rcudata.csv: - Comma-separated values spreadsheet version of rcudata. -rcu/rcugp: +rcugp: Displays grace-period counters. -rcu/rcuhier: +rcuhier: Displays the struct rcu_node hierarchy. -rcu/rcu_pending: +rcu_pending: Displays counts of the reasons rcu_pending() decided that RCU had work to do. -rcu/rcutorture: - Displays rcutorture test progress. -rcu/rcuboost: +rcuboost: Displays RCU boosting statistics. Only present if CONFIG_RCU_BOOST=y. -The output of "cat rcu/rcudata" looks as follows: - -rcu_sched: - 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 - 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 - 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 - 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 - 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 - 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 - 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 - 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 -rcu_bh: - 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 - 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 - 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 - 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 - 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 - 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 - 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 - 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 - -The first section lists the rcu_data structures for rcu_sched, the second -for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an -additional section for rcu_preempt. Each section has one line per CPU, -or eight for this 8-CPU system. The fields are as follows: +The output of "cat rcu/rcu_preempt/rcudata" looks as follows: + + 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + +This file has one line per CPU, or eight for this 8-CPU system. +The fields are as follows: o The number at the beginning of each line is the CPU number. CPUs numbers followed by an exclamation mark are offline, @@ -64,11 +74,13 @@ o The number at the beginning of each line is the CPU number. substantially larger than the number of actual CPUs. o "c" is the count of grace periods that this CPU believes have - completed. Offlined CPUs and CPUs in dynticks idle mode may - lag quite a ways behind, for example, CPU 6 under "rcu_sched" - above, which has been offline through not quite 40,000 RCU grace - periods. It is not unusual to see CPUs lagging by thousands of - grace periods. + completed. Offlined CPUs and CPUs in dynticks idle mode may lag + quite a ways behind, for example, CPU 4 under "rcu_sched" above, + which has been offline through 16 RCU grace periods. It is not + unusual to see offline CPUs lagging by thousands of grace periods. + Note that although the grace-period number is an unsigned long, + it is printed out as a signed long to allow more human-friendly + representation near boot time. o "g" is the count of grace periods that this CPU believes have started. Again, offlined CPUs and CPUs in dynticks idle mode @@ -84,30 +96,25 @@ o "pq" indicates that this CPU has passed through a quiescent state CPU has not yet reported that fact, (2) some other CPU has not yet reported for this grace period, or (3) both. -o "pgp" indicates which grace period the last-observed quiescent - state for this CPU corresponds to. This is important for handling - the race between CPU 0 reporting an extended dynticks-idle - quiescent state for CPU 1 and CPU 1 suddenly waking up and - reporting its own quiescent state. If CPU 1 was the last CPU - for the current grace period, then the CPU that loses this race - will attempt to incorrectly mark CPU 1 as having checked in for - the next grace period! - o "qp" indicates that RCU still expects a quiescent state from this CPU. Offlined CPUs and CPUs in dyntick idle mode might well have qp=1, which is OK: RCU is still ignoring them. o "dt" is the current value of the dyntick counter that is incremented - when entering or leaving dynticks idle state, either by the - scheduler or by irq. This number is even if the CPU is in - dyntick idle mode and odd otherwise. The number after the first - "/" is the interrupt nesting depth when in dyntick-idle state, - or one greater than the interrupt-nesting depth otherwise. - The number after the second "/" is the NMI nesting depth. + when entering or leaving idle, either due to a context switch or + due to an interrupt. This number is even if the CPU is in idle + from RCU's viewpoint and odd otherwise. The number after the + first "/" is the interrupt nesting depth when in idle state, + or a large number added to the interrupt-nesting depth when + running a non-idle task. Some architectures do not accurately + count interrupt nesting when running in non-idle kernel context, + which can result in interesting anomalies such as negative + interrupt-nesting levels. The number after the second "/" + is the NMI nesting depth. o "df" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being in - dynticks-idle state. + idle state. o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being @@ -120,9 +127,13 @@ o "of" is the number of times that some other CPU has forced a error, so it makes sense to err conservatively. o "ql" is the number of RCU callbacks currently residing on - this CPU. This is the total number of callbacks, regardless - of what state they are in (new, waiting for grace period to - start, waiting for grace period to end, ready to invoke). + this CPU. The first number is the number of "lazy" callbacks + that are known to RCU to only be freeing memory, and the number + after the "/" is the total number of callbacks, lazy or not. + These counters count callbacks regardless of what phase of + grace-period processing that they are in (new, waiting for + grace period to start, waiting for grace period to end, ready + to invoke). o "qs" gives an indication of the state of the callback queue with four characters: @@ -150,6 +161,43 @@ o "qs" gives an indication of the state of the callback queue If there are no callbacks in a given one of the above states, the corresponding character is replaced by ".". +o "b" is the batch limit for this CPU. If more than this number + of RCU callbacks is ready to invoke, then the remainder will + be deferred. + +o "ci" is the number of RCU callbacks that have been invoked for + this CPU. Note that ci+nci+ql is the number of callbacks that have + been registered in absence of CPU-hotplug activity. + +o "nci" is the number of RCU callbacks that have been offloaded from + this CPU. This will always be zero unless the kernel was built + with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot + parameter was specified. + +o "co" is the number of RCU callbacks that have been orphaned due to + this CPU going offline. These orphaned callbacks have been moved + to an arbitrarily chosen online CPU. + +o "ca" is the number of RCU callbacks that have been adopted by this + CPU due to other CPUs going offline. Note that ci+co-ca+ql is + the number of RCU callbacks registered on this CPU. + + +Kernels compiled with CONFIG_RCU_BOOST=y display the following from +/debug/rcu/rcu_preempt/rcudata: + + 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + +This is similar to the output discussed above, but contains the following +additional fields: + o "kt" is the per-CPU kernel-thread state. The digit preceding the first slash is zero if there is no work pending and 1 otherwise. The character between the first pair of slashes is @@ -184,35 +232,12 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of This field is displayed only for CONFIG_RCU_BOOST kernels. -o "b" is the batch limit for this CPU. If more than this number - of RCU callbacks is ready to invoke, then the remainder will - be deferred. - -o "ci" is the number of RCU callbacks that have been invoked for - this CPU. Note that ci+ql is the number of callbacks that have - been registered in absence of CPU-hotplug activity. - -o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. These orphaned callbacks have been moved - to an arbitrarily chosen online CPU. - -o "ca" is the number of RCU callbacks that have been adopted due to - other CPUs going offline. Note that ci+co-ca+ql is the number of - RCU callbacks registered on this CPU. -There is also an rcu/rcudata.csv file with the same information in -comma-separated-variable spreadsheet format. +The output of "cat rcu/rcu_preempt/rcugp" looks as follows: +completed=31249 gpnum=31250 age=1 max=18 -The output of "cat rcu/rcugp" looks as follows: - -rcu_sched: completed=33062 gpnum=33063 -rcu_bh: completed=464 gpnum=464 - -Again, this output is for both "rcu_sched" and "rcu_bh". Note that -kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional -"rcu_preempt" line. The fields are taken from the rcu_state structure, -and are as follows: +These fields are taken from the rcu_state structure, and are as follows: o "completed" is the number of grace periods that have completed. It is comparable to the "c" field from rcu/rcudata in that a @@ -220,44 +245,42 @@ o "completed" is the number of grace periods that have completed. that the corresponding RCU grace period has completed. o "gpnum" is the number of grace periods that have started. It is - comparable to the "g" field from rcu/rcudata in that a CPU - whose "g" field matches the value of "gpnum" is aware that the - corresponding RCU grace period has started. + similarly comparable to the "g" field from rcu/rcudata in that + a CPU whose "g" field matches the value of "gpnum" is aware that + the corresponding RCU grace period has started. - If these two fields are equal (as they are for "rcu_bh" above), - then there is no grace period in progress, in other words, RCU - is idle. On the other hand, if the two fields differ (as they - do for "rcu_sched" above), then an RCU grace period is in progress. + If these two fields are equal, then there is no grace period + in progress, in other words, RCU is idle. On the other hand, + if the two fields differ (as they are above), then an RCU grace + period is in progress. +o "age" is the number of jiffies that the current grace period + has extended for, or zero if there is no grace period currently + in effect. -The output of "cat rcu/rcuhier" looks as follows, with very long lines: +o "max" is the age in jiffies of the longest-duration grace period + thus far. -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 -1/1 ..>. 0:127 ^0 -3/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 -3/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 -rcu_bh: -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 -0/1 ..>. 0:127 ^0 -0/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 -0/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 +The output of "cat rcu/rcu_preempt/rcuhier" looks as follows: -This is once again split into "rcu_sched" and "rcu_bh" portions, -and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional -"rcu_preempt" section. The fields are as follows: +c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0 +3/3 ..>. 0:7 ^0 +e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1 -o "c" is exactly the same as "completed" under rcu/rcugp. +The fields are as follows: -o "g" is exactly the same as "gpnum" under rcu/rcugp. +o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp. -o "s" is the "signaled" state that drives force_quiescent_state()'s +o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp. + +o "s" is the current state of the force_quiescent_state() state machine. o "jfq" is the number of jiffies remaining for this grace period before force_quiescent_state() is invoked to help push things - along. Note that CPUs in dyntick-idle mode throughout the grace - period will not report on their own, but rather must be check by - some other CPU via force_quiescent_state(). + along. Note that CPUs in idle mode throughout the grace period + will not report on their own, but rather must be check by some + other CPU via force_quiescent_state(). o "j" is the low-order four hex digits of the jiffies counter. Yes, Paul did run into a number of problems that turned out to @@ -268,7 +291,8 @@ o "nfqs" is the number of calls to force_quiescent_state() since o "nfqsng" is the number of useless calls to force_quiescent_state(), where there wasn't actually a grace period active. This can - happen due to races. The number in parentheses is the difference + no longer happen due to grace-period processing being pushed + into a kthread. The number in parentheses is the difference between "nfqs" and "nfqsng", or the number of times that force_quiescent_state() actually did some real work. @@ -276,28 +300,27 @@ o "fqlh" is the number of calls to force_quiescent_state() that exited immediately (without even being counted in nfqs above) due to contention on ->fqslock. -o Each element of the form "1/1 0:127 ^0" represents one struct - rcu_node. Each line represents one level of the hierarchy, from - root to leaves. It is best to think of the rcu_data structures - as forming yet another level after the leaves. Note that there - might be either one, two, or three levels of rcu_node structures, - depending on the relationship between CONFIG_RCU_FANOUT and - CONFIG_NR_CPUS. +o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node + structure. Each line represents one level of the hierarchy, + from root to leaves. It is best to think of the rcu_data + structures as forming yet another level after the leaves. + Note that there might be either one, two, three, or even four + levels of rcu_node structures, depending on the relationship + between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly + adjusted using the rcu_fanout_leaf kernel boot parameter), and + CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of + possible CPUs for the booting hardware). o The numbers separated by the "/" are the qsmask followed by the qsmaskinit. The qsmask will have one bit - set for each entity in the next lower level that - has not yet checked in for the current grace period. + set for each entity in the next lower level that has + not yet checked in for the current grace period ("e" + indicating CPUs 5, 6, and 7 in the example above). The qsmaskinit will have one bit for each entity that is currently expected to check in during each grace period. The value of qsmaskinit is assigned to that of qsmask at the beginning of each grace period. - For example, for "rcu_sched", the qsmask of the first - entry of the lowest level is 0x14, meaning that we - are still waiting for CPUs 2 and 4 to check in for the - current grace period. - o The characters separated by the ">" indicate the state of the blocked-tasks lists. A "G" preceding the ">" indicates that at least one task blocked in an RCU @@ -312,48 +335,39 @@ o Each element of the form "1/1 0:127 ^0" represents one struct A "." character appears if the corresponding condition does not hold, so that "..>." indicates that no tasks are blocked. In contrast, "GE>T" indicates maximal - inconvenience from blocked tasks. + inconvenience from blocked tasks. CONFIG_TREE_RCU + builds of the kernel will always show "..>.". o The numbers separated by the ":" are the range of CPUs served by this struct rcu_node. This can be helpful in working out how the hierarchy is wired together. - For example, the first entry at the lowest level shows - "0:5", indicating that it covers CPUs 0 through 5. + For example, the example rcu_node structure shown above + has "0:7", indicating that it covers CPUs 0 through 7. o The number after the "^" indicates the bit in the - next higher level rcu_node structure that this - rcu_node structure corresponds to. - - For example, the first entry at the lowest level shows - "^0", indicating that it corresponds to bit zero in - the first entry at the middle level. - - -The output of "cat rcu/rcu_pending" looks as follows: - -rcu_sched: - 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741 - 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792 - 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629 - 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723 - 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110 - 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456 - 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834 - 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888 -rcu_bh: - 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314 - 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180 - 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936 - 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863 - 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671 - 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235 - 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921 - 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542 - -As always, this is once again split into "rcu_sched" and "rcu_bh" -portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional -"rcu_preempt" section. The fields are as follows: + next higher level rcu_node structure that this rcu_node + structure corresponds to. For example, the "d/d ..>. 4:7 + ^1" has a "1" in this position, indicating that it + corresponds to the "1" bit in the "3" shown in the + "3/3 ..>. 0:7 ^0" entry on the next level up. + + +The output of "cat rcu/rcu_sched/rcu_pending" looks as follows: + + 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 + 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 + 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 + 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 + 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 + 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 + 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 + 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 + +The fields are as follows: + +o The leading number is the CPU number, with "!" indicating + an offline CPU. o "np" is the number of times that __rcu_pending() has been invoked for the corresponding flavor of RCU. @@ -377,38 +391,23 @@ o "gpc" is the number of times that an old grace period had o "gps" is the number of times that a new grace period had started, but this CPU was not yet aware of it. -o "nn" is the number of times that this CPU needed nothing. Alert - readers will note that the rcu "nn" number for a given CPU very - closely matches the rcu_bh "np" number for that same CPU. This - is due to short-circuit evaluation in rcu_pending(). - - -The output of "cat rcu/rcutorture" looks as follows: - -rcutorture test sequence: 0 (test in progress) -rcutorture update version number: 615 - -The first line shows the number of rcutorture tests that have completed -since boot. If a test is currently running, the "(test in progress)" -string will appear as shown above. The second line shows the number of -update cycles that the current test has started, or zero if there is -no test in progress. +o "nn" is the number of times that this CPU needed nothing. The output of "cat rcu/rcuboost" looks as follows: -0:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f - balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16 -6:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f - balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6 +0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 + balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0 +4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 + balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0 This information is output only for rcu_preempt. Each two-line entry corresponds to a leaf rcu_node strcuture. The fields are as follows: o "n:m" is the CPU-number range for the corresponding two-line entry. In the sample output above, the first entry covers - CPUs zero through five and the second entry covers CPUs 6 - and 7. + CPUs zero through three and the second entry covers CPUs four + through seven. o "tasks=TNEB" gives the state of the various segments of the rnp->blocked_tasks list: From d484a215139cf556cb718a7ec7042260b7fc2d28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Oct 2012 13:22:08 -0700 Subject: [PATCH 45/49] rcu: Add documentation for the new rcuexp debugfs trace file This commit adds the documentation of the rcuexp debugfs trace file that records statistics for expedited grace periods. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 79ce9891a8c734..c776968f446300 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -41,6 +41,8 @@ also rcu/rcu_preempt) the following files will be present: rcudata: Displays fields in struct rcu_data. +rcuexp: + Displays statistics for expedited grace periods. rcugp: Displays grace-period counters. rcuhier: @@ -233,6 +235,45 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of This field is displayed only for CONFIG_RCU_BOOST kernels. +The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: + +s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872 + +These fields are as follows: + +o "s" is the starting sequence number. + +o "d" is the ending sequence number. When the starting and ending + numbers differ, there is an expedited grace period in progress. + +o "w" is the number of times that the sequence numbers have been + in danger of wrapping. + +o "tf" is the number of times that contention has resulted in a + failure to begin an expedited grace period. + +o "wd1" and "wd2" are the number of times that an attempt to + start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the + attempted request. "Our work is done." + +o "n" is number of times that contention was so great that + the request was demoted from an expedited grace period to + a normal grace period. + +o "sc" is the number of times that the attempt to start a + new expedited grace period succeeded. + +o "dt" is the number of times that we attempted to update + the "d" counter. + +o "dl" is the number of times that we failed to update the "d" + counter. + +o "dx" is the number of times that we succeeded in updating + the "d" counter. + + The output of "cat rcu/rcu_preempt/rcugp" looks as follows: completed=31249 gpnum=31250 age=1 max=18 From 3fbfbf7a3b66ec424042d909f14ba2ddf4372ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Aug 2012 21:35:53 -0700 Subject: [PATCH 46/49] rcu: Add callback-free CPUs RCU callback execution can add significant OS jitter and also can degrade both scheduling latency and, in asymmetric multiprocessors, energy efficiency. This commit therefore adds the ability for selected CPUs ("rcu_nocbs=" boot parameter) to have their callbacks offloaded to kthreads. If the "rcu_nocb_poll" boot parameter is also specified, these kthreads will do polling, removing the need for the offloaded CPUs to do wakeups. At least one CPU must be doing normal callback processing: currently CPU 0 cannot be selected as a no-CBs CPU. In addition, attempts to offline the last normal-CBs CPU will fail. This feature was inspired by Jim Houston's and Joe Korty's JRCU, and this commit includes fixes to problems located by Fengguang Wu's kbuild test robot. [ paulmck: Added gfp.h include file as suggested by Fengguang Wu. ] Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 21 ++ include/trace/events/rcu.h | 1 + init/Kconfig | 22 ++ kernel/rcutree.c | 63 ++++- kernel/rcutree.h | 47 ++++ kernel/rcutree_plugin.h | 397 +++++++++++++++++++++++++++- kernel/rcutree_trace.c | 7 +- 7 files changed, 542 insertions(+), 16 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9776f068306b7f..9d2e5cb3a95f63 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2394,6 +2394,27 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. + rcu_nocbs= [KNL,BOOT] + In kernels built with CONFIG_RCU_NOCB_CPU=y, set + the specified list of CPUs to be no-callback CPUs. + Invocation of these CPUs' RCU callbacks will + be offloaded to "rcuoN" kthreads created for + that purpose. This reduces OS jitter on the + offloaded CPUs, which can be useful for HPC and + real-time workloads. It can also improve energy + efficiency for asymmetric multiprocessors. + + rcu_nocbs_poll [KNL,BOOT] + Rather than requiring that offloaded CPUs + (specified by rcu_nocbs= above) explicitly + awaken the corresponding "rcuoN" kthreads, + make these kthreads poll for callbacks. + This improves the real-time response for the + offloaded CPUs by relieving them of the need to + wake up the corresponding kthread, but degrades + energy efficiency by requiring that the kthreads + periodically wake up to do the polling. + rcutree.blimit= [KNL,BOOT] Set maximum number of finished RCU callbacks to process in one batch. diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5bde94d8585b77..d4f559b1ec3444 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -549,6 +549,7 @@ TRACE_EVENT(rcu_torture_read, * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. * "Inc1": rcu_barrier_callback() piggyback check counter incremented. * "Offline": rcu_barrier_callback() found offline CPU + * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. diff --git a/init/Kconfig b/init/Kconfig index ec62139207d3cf..5ac6ee094225dc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -654,6 +654,28 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || TREE_PREEMPT_RCU + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuoN") will be created to + invoke callbacks, where the "N" is the CPU being offloaded. + Nothing prevents this kthread from running on the specified + CPUs, but (1) the kthreads may be preempted between each + callback, and (2) affinity or cgroups can be used to force + the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want reduced OS jitter on selected CPUs. + Say N here if you are unsure. + endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5ffadcc3bb26e6..7733eb56e156a8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -303,7 +303,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && + rdp->nxttail[RCU_DONE_TAIL] != NULL; } /* @@ -312,8 +313,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)] && + struct rcu_head **ntp; + + ntp = rdp->nxttail[RCU_DONE_TAIL + + (ACCESS_ONCE(rsp->completed) != rdp->completed)]; + return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && !rcu_gp_in_progress(rsp); } @@ -1123,6 +1127,7 @@ static void init_callback_list(struct rcu_data *rdp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + init_nocb_callback_list(rdp); } /* @@ -1633,6 +1638,10 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + /* No-CBs CPUs do not have orphanable callbacks. */ + if (is_nocb_cpu(rdp->cpu)) + return; + /* * Orphan the callbacks. First adjust the counts. This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -1684,6 +1693,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + /* No-CBs CPUs are handled specially. */ + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + return; + /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -2162,9 +2175,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, } } +/* + * Helper function for call_rcu() and friends. The cpu argument will + * normally be -1, indicating "currently running CPU". It may specify + * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * is expected to specify a CPU. + */ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, bool lazy) + struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -2184,9 +2203,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + int offline; + + if (cpu != -1) + rdp = per_cpu_ptr(rsp->rda, cpu); + offline = !__call_rcu_nocb(rdp, head, lazy); + WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ - WARN_ON_ONCE(1); local_irq_restore(flags); return; } @@ -2215,7 +2239,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 0); + __call_rcu(head, func, &rcu_sched_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); @@ -2224,7 +2248,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state, 0); + __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2676,9 +2700,17 @@ static void _rcu_barrier(struct rcu_state *rsp) * When that callback is invoked, we will know that all of the * corresponding CPU's preceding callbacks have been invoked. */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { + if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (ACCESS_ONCE(rdp->qlen)) { + if (is_nocb_cpu(cpu)) { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, rcu_barrier_callback, + rsp, cpu, 0); + } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -2752,6 +2784,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) #endif rdp->cpu = cpu; rdp->rsp = rsp; + rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2833,6 +2866,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; + int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2846,7 +2880,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); + if (nocb_cpu_expendable(cpu)) + rcu_boost_kthread_setaffinity(rnp, cpu); + else + ret = NOTIFY_BAD; break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -2870,7 +2907,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return NOTIFY_OK; + return ret; } /* @@ -2890,6 +2927,7 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_spawn_nocb_kthreads(rsp); } return 0; } @@ -3085,6 +3123,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + rcu_init_nocb(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d274af3572103d..488f2ec6b66391 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -317,6 +317,18 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + /* 7) Callback offloading. */ +#ifdef CONFIG_RCU_NOCB_CPU + struct rcu_head *nocb_head; /* CBs waiting for kthread. */ + struct rcu_head **nocb_tail; + atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ + atomic_long_t nocb_q_count_lazy; /* (approximate). */ + int nocb_p_count; /* # CBs being invoked by kthread */ + int nocb_p_count_lazy; /* (approximate). */ + wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_kthread; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int cpu; struct rcu_state *rsp; }; @@ -369,6 +381,12 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); +#ifdef CONFIG_RCU_NOCB_CPU + void (*call_remote)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + /* call_rcu() flavor, but for */ + /* placing on remote CPU. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -439,6 +457,8 @@ struct rcu_state { #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ extern struct list_head rcu_struct_flavors; + +/* Sequence through rcu_state structures for each RCU flavor. */ #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) @@ -515,5 +535,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool is_nocb_cpu(int cpu); +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy); +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp); +static bool nocb_cpu_expendable(int cpu); +static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); +static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void init_nocb_callback_list(struct rcu_data *rdp); +static void __init rcu_init_nocb(void); #endif /* #ifndef RCU_TREE_NONCORE */ + +#ifdef CONFIG_RCU_TRACE +#ifdef CONFIG_RCU_NOCB_CPU +/* Sum up queue lengths for tracing. */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; +} +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = 0; + *qll = 0; +} +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5ce3352505e94f..6cdc372de34c41 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include +#include #include #include @@ -36,6 +37,14 @@ #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO #endif +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ +static bool rcu_nocb_poll; /* Offload kthread are to poll. */ +module_param(rcu_nocb_poll, bool, 0444); +static char __initdata nocb_buf[NR_CPUS * 5]; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. If you like #ifdef, you @@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_NOCB_CPU + if (have_rcu_nocb_mask) { + if (cpumask_test_cpu(0, rcu_nocb_mask)) { + cpumask_clear_cpu(0, rcu_nocb_mask); + pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tExperimental polled no-CBs CPUs.\n"); + } +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } #ifdef CONFIG_TREE_PREEMPT_RCU @@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 0); + __call_rcu(head, func, &rcu_preempt_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 1); + __call_rcu(head, func, &rcu_preempt_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -1025,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 1); + __call_rcu(head, func, &rcu_sched_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2104,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) } #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +#ifdef CONFIG_RCU_NOCB_CPU + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For each CPU in the set, there is a + * kthread created that pulls the callbacks from the corresponding CPU, + * waits for a grace period to elapse, and invokes the callbacks. + * The no-CBs CPUs do a wake_up() on their kthread when they insert + * a callback into any empty list, unless the rcu_nocb_poll boot parameter + * has been specified, in which case each kthread actively polls its + * CPU. (Which isn't so great for energy efficiency, but which does + * reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callback processing could also in theory be used as + * an energy-efficiency measure because CPUs with no RCU callbacks + * queued are more aggressive about entering dyntick-idle mode. + */ + + +/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + cpulist_parse(str, rcu_nocb_mask); + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +/* Is the specified CPU a no-CPUs CPU? */ +static bool is_nocb_cpu(int cpu) +{ + if (have_rcu_nocb_mask) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +/* + * Enqueue the specified string of rcu_head structures onto the specified + * CPU's no-CBs lists. The CPU is specified by rdp, the head of the + * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy + * counts are supplied by rhcount and rhcount_lazy. + * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, + struct rcu_head *rhp, + struct rcu_head **rhtp, + int rhcount, int rhcount_lazy) +{ + int len; + struct rcu_head **old_rhpp; + struct task_struct *t; + + /* Enqueue the callback on the nocb list and update counts. */ + old_rhpp = xchg(&rdp->nocb_tail, rhtp); + ACCESS_ONCE(*old_rhpp) = rhp; + atomic_long_add(rhcount, &rdp->nocb_q_count); + atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + + /* If we are not being polled and there is a kthread, awaken it ... */ + t = ACCESS_ONCE(rdp->nocb_kthread); + if (rcu_nocb_poll | !t) + return; + len = atomic_long_read(&rdp->nocb_q_count); + if (old_rhpp == &rdp->nocb_head) { + wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + rdp->qlen_last_fqs_check = 0; + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + wake_up_process(t); /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = LONG_MAX / 2; + } + return; +} + +/* + * This is a helper for __call_rcu(), which invokes this when the normal + * callback queue is inoperable. If this is not a no-CBs CPU, this + * function returns failure back to __call_rcu(), which can complain + * appropriately. + * + * Otherwise, this function queues the callback where the corresponding + * "rcuo" kthread can find it. + */ +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + + if (!is_nocb_cpu(rdp->cpu)) + return 0; + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + return 1; +} + +/* + * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is + * not a no-CBs CPU. + */ +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + long ql = rsp->qlen; + long qll = rsp->qlen_lazy; + + /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + if (!is_nocb_cpu(smp_processor_id())) + return 0; + rsp->qlen = 0; + rsp->qlen_lazy = 0; + + /* First, enqueue the donelist, if any. This preserves CB ordering. */ + if (rsp->orphan_donelist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, + rsp->orphan_donetail, ql, qll); + ql = qll = 0; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + if (rsp->orphan_nxtlist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, + rsp->orphan_nxttail, ql, qll); + ql = qll = 0; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } + return 1; +} + +/* + * There must be at least one non-no-CBs CPU in operation at any given + * time, because no-CBs CPUs are not capable of initiating grace periods + * independently. This function therefore complains if the specified + * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to + * avoid offlining the last such CPU. (Recursion is a wonderful thing, + * but you have to have a base case!) + */ +static bool nocb_cpu_expendable(int cpu) +{ + cpumask_var_t non_nocb_cpus; + int ret; + + /* + * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, + * then offlining this CPU is harmless. Let it happen. + */ + if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) + return 1; + + /* If no memory, play it safe and keep the CPU around. */ + if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) + return 0; + cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); + cpumask_clear_cpu(cpu, non_nocb_cpus); + ret = !cpumask_empty(non_nocb_cpus); + free_cpumask_var(non_nocb_cpus); + return ret; +} + +/* + * Helper structure for remote registry of RCU callbacks. + * This is needed for when a no-CBs CPU needs to start a grace period. + * If it just invokes call_rcu(), the resulting callback will be queued, + * which can result in deadlock. + */ +struct rcu_head_remote { + struct rcu_head *rhp; + call_rcu_func_t *crf; + void (*func)(struct rcu_head *rhp); +}; + +/* + * Register a callback as specified by the rcu_head_remote struct. + * This function is intended to be invoked via smp_call_function_single(). + */ +static void call_rcu_local(void *arg) +{ + struct rcu_head_remote *rhrp = + container_of(arg, struct rcu_head_remote, rhp); + + rhrp->crf(rhrp->rhp, rhrp->func); +} + +/* + * Set up an rcu_head_remote structure and the invoke call_rcu_local() + * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via + * smp_call_function_single(). + */ +static void invoke_crf_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp), + call_rcu_func_t crf) +{ + struct rcu_head_remote rhr; + + rhr.rhp = rhp; + rhr.crf = crf; + rhr.func = func; + smp_call_function_single(0, call_rcu_local, &rhr, 1); +} + +/* + * Helper functions to be passed to wait_rcu_gp(), each of which + * invokes invoke_crf_remote() to register a callback appropriately. + */ +static void __maybe_unused +call_rcu_preempt_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu); +} +static void call_rcu_bh_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_bh); +} +static void call_rcu_sched_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_sched); +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes + * callbacks queued by the corresponding no-CBs CPU. + */ +static int rcu_nocb_kthread(void *arg) +{ + int c, cl; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_head **tail; + struct rcu_data *rdp = arg; + + /* Each pass through this loop invokes one batch of callbacks */ + for (;;) { + /* If not polling, wait for next batch of callbacks. */ + if (!rcu_nocb_poll) + wait_event(rdp->nocb_wq, rdp->nocb_head); + list = ACCESS_ONCE(rdp->nocb_head); + if (!list) { + schedule_timeout_interruptible(1); + continue; + } + + /* + * Extract queued callbacks, update counts, and wait + * for a grace period to elapse. + */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + c = atomic_long_xchg(&rdp->nocb_q_count, 0); + cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + ACCESS_ONCE(rdp->nocb_p_count) += c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; + wait_rcu_gp(rdp->rsp->call_remote); + + /* Each pass through the following loop invokes a callback. */ + trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + c = cl = 0; + while (list) { + next = list->next; + /* Wait for enqueuing to complete, if needed. */ + while (next == NULL && &list->next != tail) { + schedule_timeout_interruptible(1); + next = list->next; + } + debug_rcu_head_unqueue(list); + local_bh_disable(); + if (__rcu_reclaim(rdp->rsp->name, list)) + cl++; + c++; + local_bh_enable(); + list = next; + } + trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + ACCESS_ONCE(rdp->nocb_p_count) -= c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + rdp->n_cbs_invoked += c; + } + return 0; +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + rdp->nocb_tail = &rdp->nocb_head; + init_waitqueue_head(&rdp->nocb_wq); +} + +/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + struct task_struct *t; + + if (rcu_nocb_mask == NULL) + return; + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(rsp->rda, cpu); + t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp->nocb_kthread) = t; + } +} + +/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ +static void init_nocb_callback_list(struct rcu_data *rdp) +{ + if (rcu_nocb_mask == NULL || + !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + return; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; +} + +/* Initialize the ->call_remote fields in the rcu_state structures. */ +static void __init rcu_init_nocb(void) +{ +#ifdef CONFIG_PREEMPT_RCU + rcu_preempt_state.call_remote = call_rcu_preempt_remote; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + rcu_bh_state.call_remote = call_rcu_bh_remote; + rcu_sched_state.call_remote = call_rcu_sched_remote; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static bool is_nocb_cpu(int cpu) +{ + return false; +} + +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + return 0; +} + +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + return 0; +} + +static bool nocb_cpu_expendable(int cpu) +{ + return 1; +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ +} + +static void init_nocb_callback_list(struct rcu_data *rdp) +{ +} + +static void __init rcu_init_nocb(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index f9512687a6e529..3189f9aa3e84a2 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -113,6 +113,8 @@ static char convert_kthread_status(unsigned int kthread_status) static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { + long ql, qll; + if (!rdp->beenonline) return; seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", @@ -126,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); + rcu_nocb_q_lengths(rdp, &ql, &qll); + qll += rdp->qlen_lazy; + ql += rdp->qlen; seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - rdp->qlen_lazy, rdp->qlen, + qll, ql, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != From c635a4e1c24e9396db10ed7424b2084908b32252 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Oct 2012 07:29:20 -0700 Subject: [PATCH 47/49] rcu: Separate accounting of callbacks from callback-free CPUs Currently, callback invocations from callback-free CPUs are accounted to the CPU that registered the callback, but using the same field that is used for normal callbacks. This makes it impossible to determine from debugfs output whether callbacks are in fact being diverted. This commit therefore adds a separate ->n_nocbs_invoked field in the rcu_data structure in which diverted callback invocations are counted. RCU's debugfs tracing still displays normal callback invocations using ci=, but displayed diverted callbacks with nci=. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 2 +- kernel/rcutree_trace.c | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 488f2ec6b66391..4b69291b093d71 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -287,6 +287,7 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6cdc372de34c41..f6e5ec2932b4ae 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2406,7 +2406,7 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); ACCESS_ONCE(rdp->nocb_p_count) -= c; ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; - rdp->n_cbs_invoked += c; + rdp->n_nocbs_invoked += c; } return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3189f9aa3e84a2..0d095dcaa6708b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -148,8 +148,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); + seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_nocbs_invoked, + rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata(struct seq_file *m, void *v) From 4e79752c25ec221ac1e28f8875b539ed7631a0db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2012 13:35:32 -0800 Subject: [PATCH 48/49] sched: Mark RCU reader in sched_show_task() When sched_show_task() is invoked from try_to_freeze_tasks(), there is no RCU read-side critical section, resulting in the following splat: [ 125.780730] =============================== [ 125.780766] [ INFO: suspicious RCU usage. ] [ 125.780804] 3.7.0-rc3+ #988 Not tainted [ 125.780838] ------------------------------- [ 125.780875] /home/rafael/src/linux/kernel/sched/core.c:4497 suspicious rcu_dereference_check() usage! [ 125.780946] [ 125.780946] other info that might help us debug this: [ 125.780946] [ 125.781031] [ 125.781031] rcu_scheduler_active = 1, debug_locks = 0 [ 125.781087] 4 locks held by s2ram/4211: [ 125.781120] #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x3f/0x160 [ 125.781233] #1: (s_active#94){.+.+.+}, at: [] sysfs_write_file+0xc8/0x160 [ 125.781339] #2: (pm_mutex){+.+.+.}, at: [] pm_suspend+0x81/0x230 [ 125.781439] #3: (tasklist_lock){.?.?..}, at: [] try_to_freeze_tasks+0x2cd/0x3f0 [ 125.781543] [ 125.781543] stack backtrace: [ 125.781584] Pid: 4211, comm: s2ram Not tainted 3.7.0-rc3+ #988 [ 125.781632] Call Trace: [ 125.781662] [] lockdep_rcu_suspicious+0x103/0x140 [ 125.781719] [] sched_show_task+0x121/0x180 [ 125.781770] [] try_to_freeze_tasks+0x394/0x3f0 [ 125.781823] [] freeze_kernel_threads+0x25/0x80 [ 125.781876] [] pm_suspend+0x165/0x230 [ 125.781924] [] state_store+0x99/0x100 [ 125.781975] [] kobj_attr_store+0x17/0x20 [ 125.782038] [] sysfs_write_file+0xe1/0x160 [ 125.782091] [] vfs_write+0xc6/0x180 [ 125.782138] [] sys_write+0x5a/0xa0 [ 125.782185] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 125.782242] [] system_call_fastpath+0x16/0x1b This commit therefore adds the needed RCU read-side critical section. Reported-by: "Rafael J. Wysocki" Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d4569e0924d2f..36f260864f65df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4474,6 +4474,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { unsigned long free = 0; + int ppid; unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; @@ -4493,8 +4494,11 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), + task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); From 91d1aa43d30505b0b825db8898ffc80a8eca96c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 27 Nov 2012 19:33:25 +0100 Subject: [PATCH 49/49] context_tracking: New context tracking susbsystem Create a new subsystem that probes on kernel boundaries to keep track of the transitions between level contexts with two basic initial contexts: user or kernel. This is an abstraction of some RCU code that use such tracking to implement its userspace extended quiescent state. We need to pull this up from RCU into this new level of indirection because this tracking is also going to be used to implement an "on demand" generic virtual cputime accounting. A necessary step to shutdown the tick while still accounting the cputime. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Li Zhong Cc: Gilad Ben-Yossef Reviewed-by: Steven Rostedt [ paulmck: fix whitespace error and email address. ] Signed-off-by: Paul E. McKenney --- arch/Kconfig | 15 ++-- arch/x86/Kconfig | 2 +- .../include/asm/{rcu.h => context_tracking.h} | 15 ++-- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/ptrace.c | 8 +- arch/x86/kernel/signal.c | 5 +- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/fault.c | 2 +- include/linux/context_tracking.h | 18 ++++ include/linux/rcupdate.h | 2 - init/Kconfig | 28 +++---- kernel/Makefile | 1 + kernel/context_tracking.c | 83 +++++++++++++++++++ kernel/rcutree.c | 64 +------------- kernel/sched/core.c | 11 +-- 15 files changed, 150 insertions(+), 108 deletions(-) rename arch/x86/include/asm/{rcu.h => context_tracking.h} (63%) create mode 100644 include/linux/context_tracking.h create mode 100644 kernel/context_tracking.c diff --git a/arch/Kconfig b/arch/Kconfig index 366ec06a5185f0..cc74aaea116c8c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -300,15 +300,16 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. -config HAVE_RCU_USER_QS +config HAVE_CONTEXT_TRACKING bool help - Provide kernel entry/exit hooks necessary for userspace - RCU extended quiescent state. Syscalls need to be wrapped inside - rcu_user_exit()-rcu_user_enter() through the slow path using - TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs - are already protected inside rcu_irq_enter/rcu_irq_exit() but - preemption or signal handling on irq exit still need to be protected. + Provide kernel/user boundaries probes necessary for subsystems + that need it, such as userspace RCU extended quiescent state. + Syscalls need to be wrapped inside user_exit()-user_enter() through + the slow path using TIF_NOHZ flag. Exceptions handlers must be + wrapped as well. Irqs are already protected inside + rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on + irq exit still need to be protected. config HAVE_VIRT_CPU_ACCOUNTING bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced20e..110cfad24f2664 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,7 +106,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select HAVE_RCU_USER_QS if X86_64 + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select GENERIC_KERNEL_THREAD select GENERIC_KERNEL_EXECVE diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/context_tracking.h similarity index 63% rename from arch/x86/include/asm/rcu.h rename to arch/x86/include/asm/context_tracking.h index d1ac07a23979fd..1616562683e9a5 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/context_tracking.h @@ -1,27 +1,26 @@ -#ifndef _ASM_X86_RCU_H -#define _ASM_X86_RCU_H +#ifndef _ASM_X86_CONTEXT_TRACKING_H +#define _ASM_X86_CONTEXT_TRACKING_H #ifndef __ASSEMBLY__ - -#include +#include #include static inline void exception_enter(struct pt_regs *regs) { - rcu_user_exit(); + user_exit(); } static inline void exception_exit(struct pt_regs *regs) { -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING if (user_mode(regs)) - rcu_user_enter(); + user_enter(); #endif } #else /* __ASSEMBLY__ */ -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING # define SCHEDULE_USER call schedule_user #else # define SCHEDULE_USER call schedule diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0c58952d64e872..98faeb30139d70 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index eff5b8c68652d5..65b88a5dc1a8a2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -1461,7 +1461,7 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; - rcu_user_exit(); + user_exit(); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1516,7 +1516,7 @@ void syscall_trace_leave(struct pt_regs *regs) * or do_notify_resume(), in which case we can be in RCU * user mode. */ - rcu_user_exit(); + user_exit(); audit_syscall_exit(regs); @@ -1534,5 +1534,5 @@ void syscall_trace_leave(struct pt_regs *regs) if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); - rcu_user_enter(); + user_enter(); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 29ad351804e932..20ecac112e743a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -816,7 +817,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - rcu_user_exit(); + user_exit(); #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ @@ -840,7 +841,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - rcu_user_enter(); + user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794ccee..eb8586693e0b7c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e13ecb41bee02..7a529cbab7ad1d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,7 @@ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_START */ -#include /* exception_enter(), ... */ +#include /* exception_enter(), ... */ /* * Page fault error code bits: diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h new file mode 100644 index 00000000000000..e24339ccb7f04a --- /dev/null +++ b/include/linux/context_tracking.h @@ -0,0 +1,18 @@ +#ifndef _LINUX_CONTEXT_TRACKING_H +#define _LINUX_CONTEXT_TRACKING_H + +#ifdef CONFIG_CONTEXT_TRACKING +#include + +extern void user_enter(void); +extern void user_exit(void); +extern void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next); +#else +static inline void user_enter(void) { } +static inline void user_exit(void) { } +static inline void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { } +#endif /* !CONFIG_CONTEXT_TRACKING */ + +#endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8fe7c1840d3076..275aa3f1062d4b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -222,8 +222,6 @@ extern void rcu_user_enter(void); extern void rcu_user_exit(void); extern void rcu_user_enter_after_irq(void); extern void rcu_user_exit_after_irq(void); -extern void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } diff --git a/init/Kconfig b/init/Kconfig index 5ac6ee094225dc..2054e048bb9844 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -486,9 +486,13 @@ config PREEMPT_RCU This option enables preemptible-RCU code that is common between the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. +config CONTEXT_TRACKING + bool + config RCU_USER_QS bool "Consider userspace as in RCU extended quiescent state" - depends on HAVE_RCU_USER_QS && SMP + depends on HAVE_CONTEXT_TRACKING && SMP + select CONTEXT_TRACKING help This option sets hooks on kernel / userspace boundaries and puts RCU in extended quiescent state when the CPU runs in @@ -497,24 +501,20 @@ config RCU_USER_QS try to keep the timer tick on for RCU. Unless you want to hack and help the development of the full - tickless feature, you shouldn't enable this option. It also + dynticks mode, you shouldn't enable this option. It also adds unnecessary overhead. If unsure say N -config RCU_USER_QS_FORCE - bool "Force userspace extended QS by default" - depends on RCU_USER_QS +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING help - Set the hooks in user/kernel boundaries by default in order to - test this feature that treats userspace as an extended quiescent - state until we have a real user like a full adaptive nohz option. - - Unless you want to hack and help the development of the full - tickless feature, you shouldn't enable this option. It adds - unnecessary overhead. - - If unsure say N + Probe on user/kernel boundaries by default in order to + test the features that rely on it such as userspace RCU extended + quiescent states. + This test is there for debugging until we have a real user like the + full dynticks mode. config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324ee06..f90bbfc9727fc2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 00000000000000..e0e07fd55508ee --- /dev/null +++ b/kernel/context_tracking.c @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include + +struct context_tracking { + /* + * When active is false, hooks are not set to + * minimize overhead: TIF flags are cleared + * and calls to user_enter/exit are ignored. This + * may be further optimized using static keys. + */ + bool active; + enum { + IN_KERNEL = 0, + IN_USER, + } state; +}; + +static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_FORCE + .active = true, +#endif +}; + +void user_enter(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.active) && + __this_cpu_read(context_tracking.state) != IN_USER) { + __this_cpu_write(context_tracking.state, IN_USER); + rcu_user_enter(); + } + local_irq_restore(flags); +} + +void user_exit(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.state) == IN_USER) { + __this_cpu_write(context_tracking.state, IN_KERNEL); + rcu_user_exit(); + } + local_irq_restore(flags); +} + +void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (__this_cpu_read(context_tracking.active)) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7733eb56e156a8..e441b77b614e1e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -207,9 +207,6 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), -#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) - .ignore_user_qs = true, -#endif }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -420,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - - WARN_ON_ONCE(!current->mm); - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs && !rdtp->in_user) { - rdtp->in_user = true; - rcu_eqs_enter(true); - } - local_irq_restore(flags); + rcu_eqs_enter(1); } /** @@ -579,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_user_exit(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->in_user) { - rdtp->in_user = false; - rcu_eqs_exit(true); - } - local_irq_restore(flags); + rcu_eqs_exit(1); } /** @@ -722,21 +677,6 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_RCU_USER_QS -void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) -{ - struct rcu_dynticks *rdtp; - - /* Interrupts are disabled in context switch */ - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } -} -#endif /* #ifdef CONFIG_RCU_USER_QS */ - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36f260864f65df..80f80dfca70e17 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -1886,8 +1887,8 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ - rcu_user_hooks_switch(prev, next); switch_to(prev, next, prev); barrier(); @@ -2911,7 +2912,7 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING asmlinkage void __sched schedule_user(void) { /* @@ -2920,9 +2921,9 @@ asmlinkage void __sched schedule_user(void) * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. */ - rcu_user_exit(); + user_exit(); schedule(); - rcu_user_enter(); + user_enter(); } #endif @@ -3027,7 +3028,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - rcu_user_exit(); + user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable();